Exercise sheet 5

2025-12-09 21:48:17 +01:00 · 2025-12-09 21:48:17 +01:00 · 582fc9b47a
commit 582fc9b47a
parent 31d9c8900e
25 changed files with 2310 additions and 0 deletions
--- a/ex5/ex5_1/Makefile
+++ b/ex5/ex5_1/Makefile
@ -0,0 +1,30 @@
 #
 # Compiler selection: COMPILER is the prefix of the ../<COMPILER>default.mk
 # file included at the end. The GNU compiler tools are the default.
 # '?=' only assigns if COMPILER is not set yet, so both
 #   export COMPILER=ICC_     (from the shell environment)
 #   make COMPILER=ICC_       (on the make command line)
 # take effect; a plain '=' would silently override the exported value.
 COMPILER ?= GCC_
 # use Intel compilers
 #COMPILER=ICC_
 # use PGI compilers
 # COMPILER=PGI_
 SOURCES = main.cpp mylib.cpp
 OBJECTS = $(SOURCES:.cpp=.o)
 PROGRAM	= main.${COMPILER}
 # debugging and detailed performance analysis:
 # comment out the next two lines to build without debug information
 CXXFLAGS += -g
 LINKFLAGS += -g
 # do not use -pg with PGI compilers
 include ../${COMPILER}default.mk
--- a/ex5/ex5_1/check_env.h
+++ b/ex5/ex5_1/check_env.h
@ -0,0 +1,99 @@
 #pragma once
 #include <iostream>
 #ifdef _OPENMP
 #include <omp.h>
 #endif
 #include <unordered_map>
 //#####################################
 // G.Haase
 // See https://sourceforge.net/p/predef/wiki/Compilers/
 //     http://www.cplusplus.com/doc/tutorial/preprocessor/
 //  also:  export OMP_DISPLAY_ENV=VERBOSE
 //#####################################
 /** 	Prints a banner describing the build environment.
 *
 *  The banner contains the command line, the compiler and its version,
 *  the value of __cplusplus, the parallel programming models detected at
 *  compile time (MPI, OpenMP incl. number of threads, OpenACC) and the
 *  date/time of compilation.
 *
 	@param[in] argc	number of command line arguments
 	@param[in] argv	command line arguments as array of C-strings
 */
 template <class T>
 void check_env(T argc, char const *argv[])
 {
    std::cout << "\n#######################################################################\n";
    std::cout << "Code    :";
    for (T k = 0; k < argc; ++k) std::cout << "  " << argv[k];
    std::cout << std::endl;
    // compiler:      https://sourceforge.net/p/predef/wiki/Compilers/
    std::cout <<    "Compiler:  ";
 #if defined __INTEL_COMPILER
 #pragma message(" ##########  INTEL  ###############")
    std::cout << "INTEL " << __INTEL_COMPILER;
    // Ignore warnings for unrecognized #pragma acc
 #pragma warning disable 161
    // Ignore warnings for unrecognized #pragma omp
 #pragma warning disable 3180
 #elif defined __PGI
 #pragma message(" ##########  PGI    ###############")
    std::cout << "PGI " << __PGIC__ << "." << __PGIC_MINOR__ << "." << __PGIC_PATCHLEVEL__;
 #elif defined  __clang__
 #pragma message(" ##########  CLANG    ###############")
    // full major.minor.patch triple (previously the output ended in a dangling '.')
    std::cout << "CLANG " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__;
 #elif defined __GNUC__
 #pragma message(" ##########  Gnu    ###############")
    std::cout << "Gnu " <<  __GNUC__  << "." << __GNUC_MINOR__ << "." << __GNUC_PATCHLEVEL__;
 #else
 #pragma message(" ##########  unknown Compiler   ###############")
    std::cout << "unknown";
 #endif
    std::cout << "  C++ standard: " << __cplusplus << std::endl;
    // Parallel environments
    std::cout <<    "Parallel:  ";
 #if defined MPI_VERSION
 #pragma message(" ##########  MPI    ###############")
 #ifdef OPEN_MPI
    std::cout << "OpenMPI ";
 #else
    std::cout << "MPI ";
 #endif
    std::cout << MPI_VERSION << "." << MPI_SUBVERSION << "   ";
 #endif
 #ifdef _OPENMP
 //https://www.openmp.org/specifications/
 //https://stackoverflow.com/questions/1304363/how-to-check-the-version-of-openmp-on-linux
    // _OPENMP holds the release date (yyyymm) of the supported specification
    std::unordered_map<unsigned, std::string> const map{
        {200505, "2.5"}, {200805, "3.0"}, {201107, "3.1"}, {201307, "4.0"}, {201511, "4.5"},
        {201611, "5.0"}, {201811, "5.0"}, {202011, "5.1"}, {202111, "5.2"}};
 #pragma message(" ##########  OPENMP    ###############")
    //std::cout << _OPENMP;
    std::cout << "OpenMP ";
    // plain lookup instead of exception handling; unknown dates are printed verbatim
    auto const version = map.find(_OPENMP);
    if (version != map.end()) {
        std::cout << version->second;
    }
    else {
        std::cout << _OPENMP;
    }
    #pragma omp parallel
    {
        #pragma omp master
        {
            const int nn = omp_get_num_threads();          // OpenMP
            std::cout << " ---> " <<  nn << " Threads   ";
        }
        #pragma omp barrier
    }
 #endif
 #ifdef _OPENACC
 #pragma message(" ##########  OPENACC    ###############")
    std::cout << "OpenACC   ";
 #endif
    std::cout << std::endl;
    std::cout << "Date    :  " << __DATE__ << "  " << __TIME__;
    std::cout << "\n#######################################################################\n";
 }
 // HG
--- a/ex5/ex5_1/main.cpp
+++ b/ex5/ex5_1/main.cpp
@ -0,0 +1,142 @@
 #include "check_env.h"
 #include "mylib.h"
 #include <cstdlib>          // atoi()
 #include <cstring>          // strncmp()
 #include <ctime>
 #include <iostream>
 #include <omp.h>            // OpenMP
 #include <sstream>
 #include <string>
 using namespace std;
 /** 	Benchmark driver: times the OpenMP inner product of two vectors of length N.
 *
 *  Command line:  -n <number>  sets the vector length N (default 500000001).
 *  The vectors are initialized such that <x,y> == N, which is used as
 *  correctness check. Prints the environment, the number of threads,
 *  the result and the measured time and throughput per function call.
 *
 	@param[in] argc	number of command line arguments
 	@param[in] argv	command line arguments as array of C-strings
 	@return 	    0
 */
 int main(int argc, char const *argv[])
 {
    omp_set_schedule(omp_sched_static, 2000000);
    //omp_set_schedule(omp_sched_dynamic, 1000000);
    //omp_set_schedule(omp_sched_guided, 1000000);
    //omp_set_schedule(omp_sched_auto, 1); // chunk size does not matter for auto
    // Speedup for different number of cores (incl. hyperthreading)
    omp_set_num_threads(8);
    // Print number of available processors
    cout << "Number of available processors: " << omp_get_num_procs() << endl;
    // Currently executing parallel code? -> no
    cout << "Currently in parallel? " << omp_in_parallel() << endl;
    int const NLOOPS = 10;        // chose a value such that the benchmark runs at least 10 sec.
    unsigned int N = 500000001;
 //##########################################################################
 //   Read Parameter from command line  (C++ style)
    cout << "Checking command line parameters for: -n <number> " << endl;
    for (int i = 1; i < argc; i++)
    {
        cout << " arg[" << i << "] = " << argv[i] << endl;
        string ss(argv[i]);
        // atoi() yields 0 for non-numeric input; only a positive length is accepted,
        // a negative value would otherwise wrap around to a huge unsigned N.
        int const n_arg = ("-n"==ss && i + 1 < argc) ? atoi(argv[i + 1]) : 0;
        if (n_arg > 0) // found "-n" followed by a valid number
        {
            N = static_cast<unsigned int>(n_arg);
            ++i;       // the number belongs to "-n": do not report it as a wrong argument
        }
        else
        {
            cout << "Corect call: " << argv[0] << " -n  <number>\n";
        }
    }
    cout << "\nN = " << N << endl;
    check_env(argc, argv);
 //########################################################################
    int nthreads = 0;                              // OpenMP; set by the master thread below
    #pragma omp parallel default(none) shared(cout,nthreads)
    {
        // each thread assembles its message privately and prints it in one piece
        stringstream inparallel;
        inparallel << "Currently in parallel? " << omp_in_parallel() << endl;
        int const th_id  = omp_get_thread_num();   // OpenMP
        int const nthrds = omp_get_num_threads();  // OpenMP
        stringstream ss;
        ss << "C++: Hello World from thread " << th_id << " / " << nthrds << endl;
        #pragma omp critical
        {
            cout << ss.str();                      // output to a shared resource
            cout << inparallel.str() << endl;
        }
        #pragma omp master
        nthreads = nthrds;                         // transfer the thread count to the master thread
    }
    cout << "   " << nthreads << "   threads have been started." << endl;
 //##########################################################################
 //  Memory allocation
    cout << "Memory allocation\n";
    vector<double> x(N), y(N);
    cout.precision(2);
    cout << 2.0 * N *sizeof(x[0]) / 1024 / 1024 / 1024 << " GByte Memory allocated\n";
    cout.precision(6);
 //##########################################################################
 //  Data initialization
 //  Special:  x_i = i+1;  y_i = 1/x_i  ==> <x,y> == N
    for (unsigned int i = 0; i < N; ++i)
    {
        x[i] = i + 1;
        y[i] = 1.0 / x[i];
    }
 //##########################################################################
    cout << "\nStart Benchmarking\n";
 // Do calculation
    double tstart = omp_get_wtime();                  // OpenMP
    double sk(0.0);
    for (int i = 0; i < NLOOPS; ++i)
    {
        //sk = scalar(x, y);
        sk = scalar_parallel(x, y);
        //sk = scalar_trans(x, y);
        //sk = norm(x);
    }
    double t1 = omp_get_wtime() - tstart;             // OpenMP
    t1 /= NLOOPS;           // divide by number of function calls
 //##########################################################################
 // Check the correct result
    cout << "\n <x,y> = " << sk << endl;
    // Round to nearest: the floating point sum may end up slightly below N,
    // plain truncation would then report a false error.
    if (static_cast<unsigned int>(sk + 0.5) != N)
    {
        cout << "  !!   W R O N G  result   !!\n";
    }
    cout << endl;
 //##########################################################################
 // Timings  and Performance
    cout << endl;
    cout.precision(2);
    cout << "Total benchmarking time: " << t1*NLOOPS << endl;
    cout << "Timing in sec. : " << t1 << endl;
    cout << "GFLOPS         : " << 2.0 * N / t1 / 1024 / 1024 / 1024 << endl;
    cout << "GiByte/s        : " << 2.0 * N / t1 / 1024 / 1024 / 1024 * sizeof(x[0]) << endl;
 //#########################################################################
    cout << "\n  Try the reduction with an STL-vektor \n";
    auto vr = reduction_vec_append(5);
    cout << "done\n";
    cout << vr << endl;
    return 0;
 }  // memory for x and y will be deallocated by their destructors
--- a/ex5/ex5_1/mylib.cpp
+++ b/ex5/ex5_1/mylib.cpp
@ -0,0 +1,137 @@
 #include "mylib.h"
 #include <cassert>       // assert()
 #include <cmath>
 #include <iostream>
 #include <functional>    // multiplies<>{}
 #include <list>
 #include <numeric>       // iota()
 #ifdef _OPENMP
 #include <omp.h>
 #endif
 #include <vector>
 using namespace std;
 /** 	Inner product with a hand-written work distribution.
 *
 *  The index range is split into one contiguous chunk per thread; the last
 *  thread additionally takes the remainder. The partial sums are combined
 *  by an OpenMP reduction. Compiled without OpenMP the function runs
 *  serially as a team of one thread.
 *
 	@param[in] x	vector
 	@param[in] y	vector of the same length as @p x
 	@return 	    Euclidean inner product <x,y>
 */
 double scalar_parallel(std::vector<double> const &x, std::vector<double> const &y)
 {
    assert(x.size() == y.size()); // switch off via compile flag: -DNDEBUG
    size_t const N = x.size();
    double sum = 0.0;
 #pragma omp parallel default(none) shared(x,y,N) reduction(+:sum)
    {
 #ifdef _OPENMP
        const size_t nthreads  = static_cast<size_t>(omp_get_num_threads());
        const size_t threadnum = static_cast<size_t>(omp_get_thread_num());
 #else
        // no OpenMP runtime available: behave like a team of one thread
        const size_t nthreads  = 1;
        const size_t threadnum = 0;
 #endif
        const size_t chunksize = N/nthreads;
        const size_t start = threadnum*chunksize;
        // the last thread also handles the N % nthreads leftover elements
        const size_t end = (threadnum == nthreads - 1) ? N : start + chunksize;
        for (size_t i = start; i < end; ++i)
        {
            sum += x[i] * y[i];
        }
    }
    return sum;
 }
 // Demonstrates the user-defined OpenMP reduction VecAppend (declared in mylib.h)
 // on an STL vector: every thread prints the size of its private copy, fills
 // that copy with consecutive integers starting at its thread number, and the
 // private copies are combined by the reduction at the end of the region.
 std::vector<int> reduction_vec_append(int n)
 {
    std::vector<int> vec(n);
 #pragma omp parallel default(none) shared(cout) reduction(VecAppend:vec)
    {
        int const tid = omp_get_thread_num();
        #pragma omp barrier
        #pragma omp critical
        {
            cout << tid << " : " << vec.size() << endl;
        }
        #pragma omp barrier
        std::iota(vec.begin(), vec.end(), tid);
        #pragma omp barrier
    }
    return vec;
 }
 // Inner product <x,y> computed by an OpenMP parallel for loop with a sum
 // reduction; the loop schedule is chosen at run time (schedule(runtime)).
 double scalar(std::vector<double> const &x, std::vector<double> const &y)
 {
    assert(x.size() == y.size()); // switch off via compile flag: -DNDEBUG
    size_t const len = x.size();
    double dot = 0.0;
 #pragma omp parallel for default(none) shared(x,y,len) reduction(+:dot) schedule(runtime)
    for (size_t k = 0; k < len; ++k)
    {
        dot += x[k] * y[k];
        //dot += exp(x[k])*log(y[k]);
    }
    return dot;
 }
 /** 	l2-norm of a vector.
 *
 *  The squares are summed by an OpenMP parallel for loop with a sum
 *  reduction (schedule chosen at run time); the square root of that sum is
 *  returned, as documented for this function in mylib.h.
 *
 	@param[in] x	vector
 	@return 	    Euclidean norm sqrt(<x,x>)
 */
 double norm(std::vector<double> const &x)
 {
    size_t const N = x.size();
    double sum = 0.0;
 #pragma omp parallel for default(none) shared(x,N) reduction(+:sum) schedule(runtime)
    for (size_t i = 0; i < N; ++i)
    {
        sum += x[i]*x[i];
    }
    return std::sqrt(sum);   // previously the squared norm was returned
 }
 // Demonstrates the user-defined OpenMP reduction VecAdd (declared in mylib.h)
 // on an STL vector: every thread prints the size of its private copy, fills
 // that copy with consecutive integers starting at its thread number, and the
 // private copies are combined by the reduction at the end of the region.
 std::vector<int> reduction_vec(int n)
 {
    std::vector<int> vec(n);
 #pragma omp parallel default(none) shared(cout) reduction(VecAdd:vec)
    {
        int const tid = omp_get_thread_num();
        #pragma omp barrier
        #pragma omp critical
        {
            cout << tid << " : " << vec.size() << endl;
        }
        #pragma omp barrier
        std::iota(vec.begin(), vec.end(), tid);
        #pragma omp barrier
    }
    return vec;
 }
 /** 	Inner product via std::transform and a parallel sum over iterators.
 *
 *  The componentwise products are stored in a temporary vector, which is
 *  then summed by an OpenMP parallel for loop over random access iterators.
 *
 	@param[in] x	vector
 	@param[in] y	vector of the same length as @p x
 	@return 	    Euclidean inner product <x,y>
 */
 double scalar_trans(std::vector<double> const &x, std::vector<double> const &y)
 {
    assert(x.size() == y.size()); // switch off via compile flag: -DNDEBUG
    std::vector<double> z(x.size());
    //list<double> z(x.size());  // parallel for-loop on iterators not possible (missing 'operator-')
    std::transform(x.cbegin(), x.cend(), y.cbegin(), z.begin(), std::multiplies<>{});
    double sum = 0.0;
    // The loop test has to be a relational comparison ('<'): '!=' is only a
    // canonical OpenMP loop form since OpenMP 5.0 and is rejected otherwise.
 #pragma omp parallel for default(none) shared(z) reduction(+:sum) schedule(runtime)
    for (auto pi = z.cbegin(); pi < z.cend(); ++pi)
    {
        sum += *pi;
    }
    return sum;
 }
--- a/ex5/ex5_1/mylib.h
+++ b/ex5/ex5_1/mylib.h
@ -0,0 +1,88 @@
 #pragma once
 #include <cassert>
 #include <iomanip>             // setw()
 #include <iostream>
 #include <omp.h>
 #include <vector>
 /** 	Inner product
 	@param[in] x	vector
 	@param[in] y	vector
 	@return 	    resulting Euclidian inner product <x,y>
 */
 double scalar_parallel(std::vector<double> const &x, std::vector<double> const &y);
 double scalar(std::vector<double> const &x, std::vector<double> const &y);
 double scalar_trans(std::vector<double> const &x, std::vector<double> const &y);
 // Declare additional reduction operation in OpenMP for STL-vector
 #pragma omp declare reduction(VecAppend : std::vector<int>  : omp_out.insert(omp_out.end(), omp_in.begin(), omp_in.end())) \
 initializer (omp_priv=omp_orig)
 std::vector<int> reduction_vec_append(int n);
 /** 	l2-norm
 	@param[in] x	vector
 	@return 	    resulting Euclidian norm
 */
 double norm(std::vector<double> const &x);
 /** 	 Componentwise in-place addition of two vectors of equal length.
 	@param[in,out] a	vector that receives the sums
 	@param[in]     b	vector whose elements are added to @p a
 	@return 	       reference to @p a after a+=b componentwise
 */
 template<class T>
 std::vector<T> &operator+=(std::vector<T> &a, std::vector<T> const &b)
 {
    assert(a.size()==b.size());
    // walk both vectors in lock step
    auto src = b.begin();
    for (auto dst = a.begin(); dst != a.end(); ++dst, ++src) {
        *dst += *src;
    }
    return a;
 }
 // Declare the reduction operation in OpenMP for an STL-vector
 //   omp_out += omp_in  requires operator+=(vector<int> &, vector<int> const &) from above
 // ------------------------------------------------------------
 // https://scc.ustc.edu.cn/zlsc/tc4600/intel/2016.0.109/compiler_c/common/core/GUID-7312910C-D175-4544-99C5-29C12D980744.htm
 // https://gist.github.com/eruffaldi/7180bdec4c8c9a11f019dd0ba9a2d68c
 // https://stackoverflow.com/questions/29633531/user-defined-reduction-on-vector-of-varying-size
 //  see also p.74ff in  https://www.fz-juelich.de/ias/jsc/EN/AboutUs/Staff/Hagemeier_A/docs-parallel-programming/OpenMP-Slides.pdf
 #pragma omp declare reduction(VecAdd : std::vector<int>  : omp_out += omp_in) \
 initializer (omp_priv=omp_orig)
 //   Templates are  n o t  possible, i.e. the reduction has to be declared for a specific type.
 //template <class T>
 //#pragma omp declare reduction(VecAdd : std::vector<T>  : omp_out += omp_in) initializer (omp_priv(omp_orig))
 // MS: template nach #pragma  !?
 // ------------------------------------------------------------
 /** 	Test for vector reduction.
 * 
 * The thread-private vectors of size @p n are initialized via @f$v_k^{tID}=tID+k@f$.
 * Afterwards these vectors are accumulated, i.e., 
 * @f$v_k= \sum_{tID=0}^{numThreads-1} v_k^{tID}@f$.
 * 
 * 	@param[in] n  size of global/private vector
 * 	@return  resulting global vector.
 */
 std::vector<int> reduction_vec(int n);
 /** 	Writes all elements of a vector to a stream.
 *  Each element is printed with field width 4 and followed by two blanks.
 	@param[in,out] s	output stream
 	@param[in]     x	vector
 	@return 	   the output stream @p s
 */
 template <class T>
 std::ostream &operator<<(std::ostream &s, std::vector<T> const &x)
 {
    for (auto it = x.begin(); it != x.end(); ++it) {
        s << std::setw(4) << *it << "  ";
    }
    return s;
 }
--- a/ex5/ex5_1/timing.h
+++ b/ex5/ex5_1/timing.h
@ -0,0 +1,70 @@
 #pragma once
 #include <chrono>                  // timing
 #include <stack>
 using Clock = std::chrono::system_clock;   //!< The wall clock timer chosen
 //using Clock = std::chrono::high_resolution_clock;
 using TPoint= std::chrono::time_point<Clock>;
 // [Galowicz, C++17 STL Cookbook, p. 29]
 inline 
 std::stack<TPoint> MyStopWatch; //!< starting times of the (nested) stopwatches
 /** Starts stopwatch timer.
 *  Use as @code tic(); myfunction(...) ; double tsec = toc();  @endcode
 * 
 *  The timing is allowed to be nested and the recent time is stored on top of the stack.
 * 
 *  @return recent time
 *  @see toc
 */
 inline auto tic()
 {
    MyStopWatch.push(Clock::now());
    return MyStopWatch.top();
 }
 /** Returns the elapsed time from stopwatch. 
 * 
 * The time from top of the stack is used
 * if time point @p t_b is not passed as input parameter.
 * Use as @code tic(); myfunction(...) ; double tsec = toc();  @endcode
 * or as @code auto t_b = tic(); myfunction(...) ; double tsec = toc(t_b);  @endcode
 * The last option is to be used in the case of 
 * non-nested but overlapping time measurements.
 * 
 * One entry is removed from the stack per call, if the stack is not empty.
 * 
 * @pre  If @p t_b is omitted then tic() has to be called before,
 *       otherwise the default argument reads from an empty stack.
 * @param[in]  t_b start time of some stop watch
 * @return elapsed time in seconds.
 *
 */ 
 inline double toc(TPoint const &t_b = MyStopWatch.top())
 {
    // https://en.cppreference.com/w/cpp/chrono/treat_as_floating_point
    using Unit      = std::chrono::seconds;
    using FpSeconds = std::chrono::duration<double, Unit::period>;        
    auto const t_e = Clock::now();
    // By default t_b refers to the top element of the stack: the elapsed time
    // has to be evaluated before that element is destroyed by pop().
    double const elapsed = FpSeconds(t_e-t_b).count();
    // An explicitly passed start time may come without a stack entry.
    if (!MyStopWatch.empty()) {
        MyStopWatch.pop();
    }
    return elapsed;
 }
 #include <iostream>
 #include <string>
 /** Executes function @p f and measures/prints elapsed wall clock time in microseconds
 * 
 * Call as
 * @code measure("Time for (b = b + 1)", [&]() {
        thrust::transform(b.begin(), b.end(), b.begin(), increment());
    }); @endcode
 * 
 * The output line written to std::cout has the form "<label>: <duration> microseconds".
 * 
 * The variable is declared @c inline such that this header can be included
 * in several translation units without multiple definition errors.
 * 
 * @param[in]  label  additional string to be printed with the measurement.
 * @param[in]  f      function to execute.
 * @author Therese Bösmüller, 2025
 *
 */ 
 inline const auto measure = [](const std::string& label, auto&& f) {
        auto const start    = std::chrono::high_resolution_clock::now();
        f();
        auto const stop     = std::chrono::high_resolution_clock::now();
        auto const duration = std::chrono::duration_cast<std::chrono::microseconds>(stop - start).count();
        std::cout << label << ": " << duration << " microseconds" << std::endl;
 };          // ';' is needed for a visible documentation of this lambda-function
--- a/ex5/ex5_2/Makefile
+++ b/ex5/ex5_2/Makefile
@ -0,0 +1,31 @@
 #
 # Compiler selection: COMPILER is the prefix of the ../<COMPILER>default.mk
 # file included at the end. The GNU compiler tools are the default.
 # '?=' only assigns if COMPILER is not set yet, so both
 #   export COMPILER=ICC_     (from the shell environment)
 #   make COMPILER=ICC_       (on the make command line)
 # take effect; a plain '=' would silently override the exported value.
 COMPILER ?= GCC_
 # use Intel compilers
 #COMPILER=ICC_
 # use PGI compilers
 # COMPILER=PGI_
 SOURCES = main.cpp mylib.cpp
 OBJECTS = $(SOURCES:.cpp=.o)
 PROGRAM	= main.${COMPILER}
 # debugging and detailed performance analysis:
 # comment out the next two lines to build without debug information
 CXXFLAGS += -g
 LINKFLAGS += -g
 # do not use -pg with PGI compilers
 include ../${COMPILER}default.mk
--- a/ex5/ex5_2/data_1.txt
+++ b/ex5/ex5_2/data_1.txt
@ -0,0 +1,501 @@
 141
 261
 87
 430
 258
 298
 425
 120
 496
 707
 244
 786
 75
 394
 4
 221
 2
 190
 143
 269
 175
 139
 599
 902
 940
 222
 483
 377
 524
 265
 69
 437
 174
 27
 955
 431
 962
 763
 8
 681
 706
 646
 553
 219
 773
 229
 371
 891
 857
 403
 319
 609
 911
 910
 592
 333
 854
 443
 905
 34
 533
 717
 180
 337
 188
 322
 404
 549
 49
 553
 275
 242
 244
 155
 957
 936
 819
 729
 176
 361
 189
 2
 317
 700
 626
 544
 440
 288
 502
 762
 763
 577
 748
 646
 124
 505
 348
 93
 148
 199
 673
 432
 695
 257
 10
 533
 280
 947
 907
 393
 25
 672
 838
 972
 57
 451
 583
 687
 720
 651
 727
 374
 582
 117
 58
 980
 285
 595
 963
 186
 194
 342
 933
 391
 274
 152
 398
 375
 132
 436
 92
 615
 11
 574
 790
 236
 449
 570
 62
 497
 643
 222
 838
 972
 847
 506
 279
 747
 237
 958
 621
 601
 173
 91
 256
 859
 912
 700
 726
 230
 577
 811
 404
 989
 90
 321
 512
 61
 726
 557
 530
 830
 859
 790
 318
 453
 753
 110
 110
 270
 525
 973
 711
 312
 292
 851
 912
 640
 256
 89
 839
 585
 949
 62
 585
 286
 828
 191
 443
 394
 827
 677
 208
 319
 134
 672
 571
 170
 148
 477
 909
 553
 33
 54
 806
 452
 383
 790
 365
 533
 712
 872
 329
 651
 975
 76
 588
 414
 310
 264
 759
 996
 187
 782
 196
 993
 803
 425
 729
 499
 809
 357
 74
 591
 911
 194
 433
 750
 40
 947
 764
 559
 184
 498
 518
 995
 855
 963
 679
 404
 935
 480
 232
 397
 706
 559
 757
 996
 963
 536
 964
 116
 52
 305
 581
 531
 902
 541
 432
 543
 713
 17
 801
 143
 479
 257
 370
 662
 170
 279
 199
 196
 327
 881
 472
 404
 180
 969
 408
 845
 616
 377
 878
 785
 465
 814
 899
 430
 335
 597
 902
 703
 378
 735
 955
 543
 541
 312
 72
 182
 93
 464
 10
 916
 643
 2
 31
 209
 455
 128
 9
 728
 355
 781
 437
 437
 50
 50
 92
 595
 242
 842
 858
 964
 489
 221
 227
 537
 763
 348
 462
 640
 918
 162
 716
 578
 434
 885
 394
 179
 634
 625
 328
 803
 1000
 981
 128
 233
 24
 608
 111
 408
 885
 549
 370
 209
 441
 957
 125
 471
 857
 44
 692
 979
 284
 134
 686
 910
 611
 900
 194
 755
 347
 419
 156
 820
 625
 739
 806
 68
 951
 498
 756
 743
 832
 157
 458
 619
 933
 836
 896
 583
 583
 855
 35
 886
 408
 37
 747
 155
 144
 606
 255
 325
 402
 407
 387
 610
 167
 189
 95
 324
 770
 235
 741
 693
 825
 828
 294
 310
 524
 326
 832
 811
 557
 263
 681
 234
 457
 385
 539
 992
 756
 981
 235
 529
 52
 757
 602
 858
 989
 930
 410
 1
 541
 208
 220
 326
 96
 748
 749
 544
 339
 833
 553
 958
 893
 357
 547
 347
 623
 797
 746
 126
 823
 26
 415
 732
 782
 368
--- a/ex5/ex5_2/main.GCC_
+++ b/ex5/ex5_2/main.GCC_
--- a/ex5/ex5_2/main.cpp
+++ b/ex5/ex5_2/main.cpp
@ -0,0 +1,130 @@
 #include "mylib.h"
 #include <fstream>
 #include <iostream>
 #include <omp.h>
 #include <vector>
 using namespace std;
 /** 	Compares OpenMP and C++ parallel algorithms for simple statistics.
 *
 *  Reads unsigned integers from the file data_1.txt, then computes the
 *  arithmetic, geometric and harmonic mean as well as minimum and maximum
 *  with both variants, NLOOPS times each, and prints the results together
 *  with the average wall clock time per call.
 *
 	@return 	0 on success, 1 if the input file cannot be read or contains no numbers
 */
 int main()
 {
    // read vector from file
    vector<size_t> data_vector = {};
    ifstream input_stream("data_1.txt"); 
    if (!input_stream)
    {
        cerr << "Error: cannot open input file data_1.txt" << endl;
        return 1;
    }
    size_t line;
    while(input_stream >> line)
    {
        data_vector.push_back(line);
    }
    data_vector.shrink_to_fit();
    // Without data the means divide by zero and the C++ min/max search
    // would dereference an end iterator.
    if (data_vector.empty())
    {
        cerr << "Error: no numbers found in data_1.txt" << endl;
        return 1;
    }
    // specify loops
    size_t NLOOPS = 10000;
    // ############# Parallelization with openMP #############
    // calculate arithmetic mean, geometric mean and harmonic mean
    double am_omp, gm_omp, hm_omp;
    double tstart = omp_get_wtime();
    for (size_t i = 0; i < NLOOPS; ++i)
        means_omp(data_vector, am_omp, gm_omp, hm_omp);
    double t_means_omp = (omp_get_wtime() - tstart)/NLOOPS;
    // calculate minimum and maximum
    size_t min, max;
    tstart = omp_get_wtime();
    for (size_t i = 0; i < NLOOPS; ++i)
        minmax_omp(data_vector, min, max);
    double t_minmax_omp = (omp_get_wtime() - tstart)/NLOOPS;
    // ############# Parallelization with C++ algorithms #############
    // calculate arithmetic mean, geometric mean and harmonic mean
    double am_cpp, gm_cpp, hm_cpp;
    tstart = omp_get_wtime();
    for (size_t i = 0; i < NLOOPS; ++i)
        means_cpp(data_vector, am_cpp, gm_cpp, hm_cpp);
    double t_means_cpp = (omp_get_wtime() - tstart)/NLOOPS;
    // calculate minimum and maximum
    size_t min_cpp, max_cpp;
    tstart = omp_get_wtime();
    for (size_t i = 0; i < NLOOPS; ++i)
        minmax_cpp(data_vector, min_cpp, max_cpp);
    double t_minmax_cpp = (omp_get_wtime() - tstart)/NLOOPS;
    // print results
    cout << "####### OpenMP #######" << endl;
    cout << "minimum: " << min << endl;
    cout << "maximum: " << max << endl;
    cout << "duration: " << t_minmax_omp << endl << endl;
    cout << "arithmetic mean: " << am_omp << endl;
    cout << "geometric mean: " << gm_omp << endl;
    cout << "harmonic mean: " << hm_omp << endl;
    cout << "duration: " << t_means_omp << endl << endl;
    cout << "####### C++ #######" << endl;
    cout << "minimum: " << min_cpp << endl;
    cout << "maximum: " << max_cpp << endl;
    cout << "duration: " << t_minmax_cpp << endl << endl;
    cout << "arithmetic mean: " << am_cpp << endl;
    cout << "geometric mean: " << gm_cpp << endl;
    cout << "harmonic mean: " << hm_cpp << endl;
    cout << "duration: " << t_means_cpp << endl << endl;
    // Output recorded by the author:
    // ####### OpenMP #######
    // minimum: 1
    // maximum: 1000
    // duration: 3.52086e-06
    // arithmetic mean: 498.184
    // geometric mean: 364.412
    // harmonic mean: 95.6857
    // duration: 5.90171e-06
    // ####### C++ #######
    // minimum: 1
    // maximum: 1000
    // duration: 1.76816e-05
    // arithmetic mean: 498.184
    // geometric mean: 364.412
    // harmonic mean: 95.6857
    // duration: 2.35728e-05
    // --> the openMP variant is faster in both cases
    return 0;
 }
--- a/ex5/ex5_2/main.o
+++ b/ex5/ex5_2/main.o
--- a/ex5/ex5_2/mylib.cpp
+++ b/ex5/ex5_2/mylib.cpp
@ -0,0 +1,103 @@
 #include "mylib.h"
 #include <algorithm>
 #include <cmath>
 #include <execution>
 #include <iostream>
 #include <numeric>
 #include <omp.h>
 using namespace std;
 // Arithmetic, geometric and harmonic mean of the entries of `numbers`.
 // One OpenMP parallel loop accumulates the sum, the sum of logarithms and
 // the sum of reciprocals (sum reductions); the means are derived afterwards.
 void means_omp(const std::vector<size_t> numbers, double &am, double &gm, double &hm)
 {
    size_t const count = numbers.size();
    am = 0.;
    gm = 0.;
    hm = 0.;
 #pragma omp parallel for shared(numbers, count, cout) reduction(+:am, gm, hm)
    for (size_t idx = 0; idx < count; ++idx)
    {
        double const value = static_cast<double>(numbers[idx]);
        am += value;
        gm += log(value);
        hm += 1.0/value;
        // #pragma omp critical
        // {
        //     cout << "Thread number " << omp_get_thread_num() << " processes value " << numbers[idx] << endl;
        // }
    }
    am = am/count;          // sum / n
    gm = exp(gm/count);     // exp of the mean logarithm
    hm = count/hm;          // n / sum of reciprocals
 }
 /**
  Minimum and maximum of a vector, parallelized with OpenMP.
  The built-in min/max reductions replace the former hand-written chunking
  with a critical section: the work distribution is left to the runtime and
  the function also compiles and runs serially without OpenMP.
  For an empty vector the minimum is the largest size_t value and the
  maximum is 0.
  @param[in]    numbers         vector containing integers
  @param[out]   global_min      minimum
  @param[out]   global_max      maximum
 */
 void minmax_omp(const std::vector<size_t> numbers, size_t &global_min, size_t &global_max)
 {
    size_t const n = numbers.size();
    // reduce into locals, the reference parameters are written once at the end
    size_t lo = static_cast<size_t>(-1); // gives the maximum size_t value
    size_t hi = 0;
 #pragma omp parallel for shared(numbers, n) reduction(min:lo) reduction(max:hi)
    for (size_t i = 0; i < n; ++i)
    {
        if (numbers[i] < lo)
            lo = numbers[i];
        if (numbers[i] > hi)
            hi = numbers[i];
    }
    global_min = lo;
    global_max = hi;
 }
 // Arithmetic, geometric and harmonic mean of the entries of `numbers`,
 // computed with the C++17 parallel algorithms (execution policy par):
 // one reduce for the sum and two transform_reduce calls for the sum of
 // logarithms and the sum of reciprocals.
 void means_cpp(const std::vector<size_t> numbers, double &am, double &gm, double &hm)
 {
    auto const first = numbers.begin();
    auto const last  = numbers.end();
    auto const to_log        = [] (size_t x) -> double { return std::log(static_cast<double>(x)); };
    auto const to_reciprocal = [] (size_t x) -> double { return 1.0/x; };
    size_t const n       = numbers.size();
    size_t const total   = std::reduce(std::execution::par, first, last);
    double const log_sum = std::transform_reduce(std::execution::par, first, last, 0.0, std::plus<>{}, to_log);
    double const inv_sum = std::transform_reduce(std::execution::par, first, last, 0.0, std::plus<>{}, to_reciprocal);
    am = static_cast<double>(total)/n;
    gm = std::exp(log_sum/n);
    hm = n/inv_sum;
 }
 /**
  Minimum and maximum of a vector, computed with the C++17 parallel algorithms.
  A single std::minmax_element call replaces the two separate passes.
  For an empty vector the minimum is the largest size_t value and the maximum
  is 0 (same convention as minmax_omp) instead of dereferencing end().
  @param[in]    numbers         vector containing integers
  @param[out]   global_min      minimum
  @param[out]   global_max      maximum
 */
 void minmax_cpp(const std::vector<size_t> numbers, size_t &global_min, size_t &global_max)
 {
    if (numbers.empty())
    {
        global_min = static_cast<size_t>(-1); // gives the maximum size_t value
        global_max = 0;
        return;
    }
    auto const extremes = std::minmax_element(std::execution::par, numbers.begin(), numbers.end());
    global_min = *extremes.first;
    global_max = *extremes.second;
 }
--- a/ex5/ex5_2/mylib.h
+++ b/ex5/ex5_2/mylib.h
@ -0,0 +1,42 @@
 #include <vector>
 /**
  This function calculates arithmetic mean, geometric mean and harmonic mean of an integer vector.
  Uses openMP parallelization.
  @param[in]    numbers      vector containing integers
  @param[out]   am           arithmetic mean
  @param[out]   gm           geometric mean
  @param[out]   hm           harmonic mean
 */
 void means_omp(const std::vector<size_t> numbers, double &am, double &gm, double &hm);
 /**
  This function calculates the minimum and maximum of a vector.
  Uses openMP parallelization.
  @param[in]    numbers         vector containing integers
  @param[out]   global_min      minimum
  @param[out]   global_max      maximum
 */
 void minmax_omp(const std::vector<size_t> numbers, size_t &global_min, size_t &global_max);
 /**
  This function calculates arithmetic mean, geometric mean and harmonic mean of an integer vector.
  Uses C++ parallelization.
  @param[in]    numbers      vector containing integers
  @param[out]   am           arithmetic mean
  @param[out]   gm           geometric mean
  @param[out]   hm           harmonic mean
 */
 void means_cpp(const std::vector<size_t> numbers, double &am, double &gm, double &hm);
 /**
  This function calculates the minimum and maximum of a vector.
  Uses C++ parallelization.
  @param[in]    numbers         vector containing integers
  @param[out]   global_min      minimum
  @param[out]   global_max      maximum
 */
 void minmax_cpp(const std::vector<size_t> numbers, size_t &global_min, size_t &global_max);
--- a/ex5/ex5_2/mylib.o
+++ b/ex5/ex5_2/mylib.o
--- a/ex5/ex5_3/Makefile
+++ b/ex5/ex5_3/Makefile
@ -0,0 +1,30 @@
 #
 # Compiler selection: COMPILER is the prefix of the ../<COMPILER>default.mk
 # file included at the end. The GNU compiler tools are the default.
 # '?=' only assigns if COMPILER is not set yet, so both
 #   export COMPILER=ICC_     (from the shell environment)
 #   make COMPILER=ICC_       (on the make command line)
 # take effect; a plain '=' would silently override the exported value.
 COMPILER ?= GCC_
 # use Intel compilers
 #COMPILER=ICC_
 # use PGI compilers
 # COMPILER=PGI_
 SOURCES = main.cpp goldbach.cpp 
 OBJECTS = $(SOURCES:.cpp=.o)
 PROGRAM	= main.${COMPILER}
 # debugging and detailed performance analysis:
 # comment out the next two lines to build without debug information
 CXXFLAGS += -g
 LINKFLAGS += -g
 # do not use -pg with PGI compilers
 include ../${COMPILER}default.mk
--- a/ex5/ex5_3/goldbach.cpp
+++ b/ex5/ex5_3/goldbach.cpp
@ -0,0 +1,46 @@
 #include "goldbach.h"
 #include <iostream>
 #include <iterator>
 #include <omp.h>
 size_t single_goldbach(size_t k)
 {
    const std::vector<size_t> relevant_primes = get_primes(k);
    size_t m = relevant_primes.size();
    size_t counter = 0;
 #pragma omp parallel for shared(relevant_primes, m, k) reduction(+:counter)
    for(size_t i = 0; i < m; ++i)
    {
        for(size_t j = i; j < m; ++j)
        {
            if(relevant_primes[i] + relevant_primes[j] == k)
                ++counter;
        }
    }
    return counter;
 }
 std::vector<size_t> count_goldbach(size_t n)
 {
    const std::vector<size_t> relevant_primes = get_primes(n);
    size_t m = relevant_primes.size();
    std::vector<size_t> counter_vector(n + 1, 0);
 #pragma omp parallel for shared(relevant_primes, m, n) reduction(VecAdd:counter_vector)
    for(size_t i = 0; i < m; ++i)
    {
        for(size_t j = i; j < m; ++j)
        {
            size_t sum = relevant_primes[i] + relevant_primes[j];
            if(sum <= n)
                ++counter_vector[relevant_primes[i] + relevant_primes[j]];
        }
    }
    return counter_vector;
 }
--- a/ex5/ex5_3/goldbach.h
+++ b/ex5/ex5_3/goldbach.h
@ -0,0 +1,45 @@
 #pragma once
 #include "mayer_primes.h"
 #include <cassert>
 #include <vector>
 /**
  This function returns the number of possible decompositions of an integer into a sum of two prime numbers.
  @param[in]    k            integer to decompose
  @return                    number of decompositions of @p k
 */
 size_t single_goldbach(size_t k);
 /**
  This function returns the number of possible decompositions into a sum of two prime numbers of all even integers in the interval [4,n].
  @param[in]    n                   upper integer bound
  @return                           vector of size n+1 whose entry at index i is the number of decompositions of i
 */
 std::vector<size_t> count_goldbach(size_t n);
 /** 	 Componentwise in-place addition of two vectors of equal length.
 	@param[in,out] a	vector that receives the sums
 	@param[in]     b	vector whose elements are added to @p a
 	@return 	       reference to @p a after a+=b componentwise
 */
 template<class T>
 std::vector<T> &operator+=(std::vector<T> &a, std::vector<T> const &b)
 {
    assert(a.size()==b.size());
    // walk both vectors in lock step
    auto src = b.begin();
    for (auto dst = a.begin(); dst != a.end(); ++dst, ++src) {
        *dst += *src;
    }
    return a;
 }
 // Declare the reduction operation in OpenMP for an STL-vector
 //   omp_out += omp_in  requires operator+=(vector<int> &, vector<int> const &) from above
 // ------------------------------------------------------------
 // https://scc.ustc.edu.cn/zlsc/tc4600/intel/2016.0.109/compiler_c/common/core/GUID-7312910C-D175-4544-99C5-29C12D980744.htm
 // https://gist.github.com/eruffaldi/7180bdec4c8c9a11f019dd0ba9a2d68c
 // https://stackoverflow.com/questions/29633531/user-defined-reduction-on-vector-of-varying-size
 //  see also p.74ff in  https://www.fz-juelich.de/ias/jsc/EN/AboutUs/Staff/Hagemeier_A/docs-parallel-programming/OpenMP-Slides.pdf
 #pragma omp declare reduction(VecAdd : std::vector<size_t>  : omp_out += omp_in) initializer (omp_priv=omp_orig)
--- a/ex5/ex5_3/main.cpp
+++ b/ex5/ex5_3/main.cpp
@ -0,0 +1,45 @@
 #include "goldbach.h"
 #include <algorithm>
 #include <iostream>
 #include <omp.h>
 using namespace std;
 // Driver: prints a single sanity check and then, for several upper bounds,
 // the number with the most Goldbach decompositions together with the run time.
 int main()
 {
    cout << "Check: 694 has " << single_goldbach(694) << " decompositions." << endl;
    cout << "----------------------------------------" << endl;
    for (size_t upper_bound : {10000, 100000, 400000, 1000000, 2000000})
    {
        double const tic = omp_get_wtime();
        auto const counts = count_goldbach(upper_bound);
        // position of the largest counter == number with the most decompositions
        auto const best = max_element(counts.begin(), counts.end());
        size_t const best_number = best - counts.begin();
        double const elapsed = omp_get_wtime() - tic;
        cout << "The number " << best_number << " has " << *best << " decompositions. Duration: " << elapsed << endl;
    }
    /*
    ###### WITHOUT PARALLELIZATION ######
    The number 9240 has 329 decompositions. Duration: 0.00307696
    The number 99330 has 2168 decompositions. Duration: 0.189839
    The number 390390 has 7094 decompositions. Duration: 1.3042
    The number 990990 has 15594 decompositions. Duration: 5.45034
    The number 1981980 has 27988 decompositions. Duration: 47.1807
    ###### WITH PARALLELIZATION ######
    The number 9240 has 329 decompositions. Duration: 0.000734854
    The number 99330 has 2168 decompositions. Duration: 0.0251322
    The number 390390 has 7094 decompositions. Duration: 0.487375
    The number 990990 has 15594 decompositions. Duration: 6.16972
    The number 1981980 has 27988 decompositions. Duration: 31.5699
    */
    return 0;
 }
--- a/ex5/ex5_3/mayer_primes.h
+++ b/ex5/ex5_3/mayer_primes.h
@ -0,0 +1,73 @@
 #pragma once
 #include <cstring> //memset
 #include <vector>
 //using namespace std;
 /** \brief Determines all prime numbers in interval [2, @p max].
 *
 *  The sieve of Eratosthenes is used.
 *
 *  The implementation originates from <a href="http://code.activestate.com/recipes/576559-fast-prime-generator/">Florian Mayer</a>.
 *
 * \param[in]   max end of interval for the prime number search.
 * \return  vector of prime numbers @f$2,3,5, ..., p<=max @f$.
 *
 * \copyright
 * Copyright (c) 2008 Florian Mayer          (adapted by Gundolf Haase 2018)
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
 * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 */
 template <class T>
 std::vector<T> get_primes(T max)
 {
    std::vector<T> primes;
    // No primes below 2; this also rejects negative bounds of signed types.
    if (max < 2) {
        return primes;
    }
    // Bit sieve: bit (v % 8) of byte (v / 8) is set while v is still a prime candidate.
    // The vector owns the storage, so nothing leaks if push_back throws.
    std::vector<unsigned char> sieve(static_cast<std::size_t>(max / 8) + 1, 0xFF);
    // The loops terminate via explicit breaks instead of "x <= max" / "j <= max":
    // those conditions never become false (or overflow) when max is the largest value of T.
    for (T x = 2; ; ++x)
    {
        std::size_t const xs = static_cast<std::size_t>(x);
        if (sieve[xs / 8] & (1u << (xs % 8))) {
            primes.push_back(x);
            // Is prime. Mark multiples, starting at x*x: smaller multiples have a
            // smaller prime factor and are already marked.
            // "x <= max / x" is the overflow-free form of "x*x <= max".
            if (x <= max / x) {
                for (T j = static_cast<T>(x * x); ; j = static_cast<T>(j + x))
                {
                    std::size_t const js = static_cast<std::size_t>(j);
                    sieve[js / 8] &= static_cast<unsigned char>(~(1u << (js % 8)));
                    if (max - j < x) {      // next multiple would exceed max
                        break;
                    }
                }
            }
        }
        if (x == max) {
            break;
        }
    }
    return primes;
 }
 //---------------------------------------------------------------
 //int main()        // by  Florian Mayer
 //{g++ -O3  -std=c++14 -fopenmp main.cpp && ./a.out
 //    vector<unsigned long> primes;
 //    primes = get_primes(10000000);
 //    // return 0;
 //    // Print out result.
 //    vector<unsigned long>::iterator it;
 //    for(it=primes.begin(); it < primes.end(); it++)
 //        cout << *it << " ";
 //
 //    cout << endl;
 //    return 0;
 //}
--- a/ex5/ex5_4/Makefile
+++ b/ex5/ex5_4/Makefile
@ -0,0 +1,30 @@
 #
 # use GNU-Compiler tools
 COMPILER=GCC_
 # alternatively from the shell
 # export COMPILER=GCC_
 # or, alternatively from the shell
 # make COMPILER=GCC_
 # use Intel compilers
 #COMPILER=ICC_
 # use PGI compilers
 # COMPILER=PGI_
 SOURCES = main.cpp benchmarks.cpp benchmark_tests.cpp
 OBJECTS = $(SOURCES:.cpp=.o)
 PROGRAM	= main.${COMPILER}
 # the next two lines enable debug information (-g) for debugging and detailed performance analysis; comment them out to disable it
 CXXFLAGS += -g
 LINKFLAGS += -g
 # do not use -pg with PGI compilers
 ifndef COMPILER
  COMPILER=GCC_
 endif
 include ../${COMPILER}default.mk
--- a/ex5/ex5_4/benchmark_tests.cpp
+++ b/ex5/ex5_4/benchmark_tests.cpp
@ -0,0 +1,375 @@
 #include "benchmark_tests.h"
 #include "benchmarks.h"
 #include <chrono>
 #include <iostream>
 #include <math.h>       
 using namespace std::chrono; 
 /** Benchmark (A): inner product <x,y> of two vectors of length @p N.
 *
 *  Calls scalar_parallel() @p NLOOPS times, checks the result and prints the timings.
 *
 *  @param[in] NLOOPS  number of timed repetitions (must be > 0, the time is divided by it)
 *  @param[in] N       vector length
 *  @return {time per repetition in seconds, GFLOPS, GiByte/s}
 */
 vector<double> test_A(const size_t &NLOOPS, const size_t &N)
 {
    cout << "#################### (A) ####################" << endl;
    cout << "\nLOOPS = " << NLOOPS << endl;
    cout << "\nN = " << N << endl;
 //  Memory allocation
    cout << "Memory allocation\n";
    vector<double> x(N), y(N);
    cout.precision(2);
    cout << 2.0*N *sizeof(x[0]) / 1024 / 1024 / 1024 << " GByte Memory allocated\n";
    cout.precision(6);
 //  Data initialization
 //  Special:  x_i = (i mod 219) + 1;  y_i = 1/x_i  ==> <x,y> == N  (up to rounding)
    for (size_t i = 0; i < N; ++i)
    {
        x[i] = i % 219 + 1;
        y[i] = 1.0/x[i];
    }
    cout << "\nStart Benchmarking scalar\n";
    auto t1 = steady_clock::now(); // start timer (monotonic clock, unaffected by system time adjustments)
 // Do calculation
    double check(0.0),ss(0.0);
    for (size_t i = 0; i < NLOOPS; ++i)
    {
        check = scalar_parallel(x, y);
        ss += check;                   // prevents the optimizer from removing unused calculation results.
    }
    auto t2 = steady_clock::now();  // stop timer
    auto duration = duration_cast<microseconds>(t2 - t1);        // duration in microseconds
    double t_diff = static_cast<double>(duration.count()) / 1e6; // overall duration in seconds
    t_diff = t_diff/NLOOPS;                                      // duration per loop seconds
 // Check the correct result
    cout << "\n <x,y> = " << check << endl;
 // Compare after rounding to the nearest integer: a truncating cast reports a result
 // like N - 1e-9 as wrong, and a cast to unsigned int wraps for large N.
    if (round(check) != static_cast<double>(N))
        cout << "  !!   W R O N G  result   !!\n";
    cout << endl;
 // Timings  and Performance
    cout << endl;
    cout.precision(2);
    double Gflops = 2.0*N / t_diff / 1024 / 1024 / 1024;
    double MemBandwidth = 2.0*N / t_diff / 1024 / 1024 / 1024 * sizeof(x[0]);
    cout << "Total duration : " << t_diff*NLOOPS << endl;
    cout << "Timing in sec. : " << t_diff << endl;
    cout << "GFLOPS         : " << Gflops << endl;
    cout << "GiByte/s       : " << MemBandwidth << endl;
    return vector<double>{t_diff, Gflops, MemBandwidth};
 }
 /** Benchmark (A) sum: sum of the entries of a vector of length @p N.
 *
 *  Calls sum() @p NLOOPS times, checks the result and prints the timings.
 *
 *  @param[in] NLOOPS  number of timed repetitions (must be > 0, the time is divided by it)
 *  @param[in] N       vector length
 *  @return {time per repetition in seconds, GFLOPS, GiByte/s}
 */
 vector<double> test_A_sum(const size_t &NLOOPS, const size_t &N)
 {
    cout << "#################### (A) sum ####################" << endl;
    cout << "\nLOOPS = " << NLOOPS << endl;
    cout << "\nN = " << N << endl;
 //  Memory allocation
    cout << "Memory allocation\n";
    vector<double> x(N);
    cout.precision(2);
    cout << 1.0*N *sizeof(x[0]) / 1024 / 1024 / 1024 << " GByte Memory allocated\n";
    cout.precision(6);
 //  Data initialization:  x_i = 1  ==> sum(x) == N
    for (size_t i = 0; i < N; ++i)
    {
        x[i] = 1;
    }
    cout << "\nStart Benchmarking sum\n";
    auto t1 = steady_clock::now(); // start timer (monotonic clock, unaffected by system time adjustments)
 // Do calculation
    double check(0.0),ss(0.0);
    for (size_t i = 0; i < NLOOPS; ++i)
    {
        check = sum(x);
        ss += check;                   // prevents the optimizer from removing unused calculation results.
    }
    auto t2 = steady_clock::now();  // stop timer
    auto duration = duration_cast<microseconds>(t2 - t1);        // duration in microseconds
    double t_diff = static_cast<double>(duration.count()) / 1e6; // overall duration in seconds
    t_diff = t_diff/NLOOPS;                                      // duration per loop seconds
 // Check the correct result
    cout << "\n <x,y> = " << check << endl;
 // Compare after rounding to the nearest integer instead of truncating through unsigned int
 // (which wraps for large N and turns tiny rounding errors into a mismatch).
    if (round(check) != static_cast<double>(N))
        cout << "  !!   W R O N G  result   !!\n";
    cout << endl;
 // Timings  and Performance
    cout << endl;
    cout.precision(2);
    double Gflops = 1.0*N / t_diff / 1024 / 1024 / 1024;
    double MemBandwidth = 1.0*N / t_diff / 1024 / 1024 / 1024 * sizeof(x[0]);
    cout << "Total duration : " << t_diff*NLOOPS << endl;
    cout << "Timing in sec. : " << t_diff << endl;
    cout << "GFLOPS         : " << Gflops << endl;
    cout << "GiByte/s       : " << MemBandwidth << endl;
    return vector<double>{t_diff, Gflops, MemBandwidth};
 }
 /** Benchmark (B): dense matrix-vector product b = A*x with an MxN matrix (row-wise 1D storage).
 *
 *  Calls MatVec_parallel() @p NLOOPS times, checks one entry of the result and prints the timings.
 *
 *  @param[in] NLOOPS  number of timed repetitions (must be > 0, the time is divided by it)
 *  @param[in] N       number of matrix columns == length of x
 *  @param[in] M       number of matrix rows
 *  @return {time per repetition in seconds, GFLOPS, GiByte/s}
 */
 vector<double> test_B(const size_t &NLOOPS, const size_t &N, const size_t &M)
 {
    cout << "#################### (B) ####################" << endl;
    cout << "\nLOOPS = " << NLOOPS << endl;
    cout << "\nN = " << N << endl;
    cout << "\nM = " << M << endl;
 //  Memory allocation
    cout << "Memory allocation\n";
    vector<double> A(M*N);
    vector<double> x(N);
    cout.precision(2);
    cout << (1.0*M*N + N) * sizeof(x[0]) / 1024 / 1024 / 1024 << " GByte Memory allocated\n";
    cout.precision(6);
 //  Row used for the result check: row 17, clamped to the last row for matrices
 //  with fewer than 18 rows (a fixed index 17 would read out of bounds there).
    size_t const row = (M > 17) ? 17 : (M > 0 ? M - 1 : 0);
 //  Data initialization
    for (size_t i = 0; i < M; ++i)
        for (size_t j = 0; j < N; ++j)
            A[N*i + j] = (i + j) % 219 + 1;
 //  x_j = 1/A[row,j]  ==>  <A[row,*],x> == N ; computed from the formula of A so that
 //  no matrix entry has to be read (A may be empty).
    for (size_t j = 0; j < N; ++j)
    {
        x[j] = 1.0/((row + j) % 219 + 1);
    }
    cout << "\nStart Benchmarking MatVec\n";
    auto t1 = steady_clock::now(); // start timer (monotonic clock, unaffected by system time adjustments)
 // Do calculation
    vector<double> b(M);
    for (size_t i = 0; i < NLOOPS; ++i)
    {
        b = MatVec_parallel(A, x);
    }
    auto t2 = steady_clock::now();  // stop timer
    auto duration = duration_cast<microseconds>(t2 - t1);        // duration in microseconds
    double t_diff = static_cast<double>(duration.count()) / 1e6; // overall duration in seconds
    t_diff = t_diff/NLOOPS;                                      // duration per loop seconds
 // Check the correct result (skipped if the result has no entry to check)
    if (row < b.size())
    {
        cout << "\n <A[" << row << ",*],x> = " << b[row] << endl;
        // rounding instead of truncation: a result like N - 1e-9 is still correct
        if (round(b[row]) != static_cast<double>(N))
        {
            cout << "  !!   W R O N G  result   !!\n";
        }
    }
    cout << endl;
 // Timings  and Performance
    cout << endl;
    cout.precision(2);
    double Gflops = (2.0*N*M) / t_diff / 1024 / 1024 / 1024;
    double MemBandwidth = (2.0*N*M + M)/ t_diff / 1024 / 1024 / 1024 * sizeof(x[0]);
    cout << "Total duration : " << t_diff*NLOOPS << endl;
    cout << "Timing in sec. : " << t_diff << endl;
    cout << "GFLOPS         : " << Gflops << endl;
    cout << "GiByte/s       : " << MemBandwidth << endl;
    return vector<double>{t_diff, Gflops, MemBandwidth};
 }
 /** Benchmark (C): dense matrix-matrix product C = A*B with A (MxL) and B (LxN), row-wise 1D storage.
 *
 *  Calls MatMat_parallel() @p NLOOPS times, checks one entry of the result and prints the timings.
 *
 *  @param[in] NLOOPS  number of timed repetitions (must be > 0, the time is divided by it)
 *  @param[in] L       shared dimension (columns of A == rows of B)
 *  @param[in] M       number of rows of A
 *  @param[in] N       number of columns of B
 *  @return {time per repetition in seconds, GFLOPS, GiByte/s}
 */
 vector<double> test_C(const size_t &NLOOPS, const size_t &L, const size_t &M, const size_t &N)
 {
    cout << "#################### (C) ####################" << endl;
    cout << "\nLOOPS = " << NLOOPS << endl;
    cout << "\nL = " << L << endl;
    cout << "\nM = " << M << endl;
    cout << "\nN = " << N << endl;
 //  Memory allocation
    cout << "Memory allocation\n";
    vector<double> A(M*L);
    vector<double> B(L*N);
    cout.precision(2);
    cout << (1.0*M*L + L*N) *sizeof(A[0]) / 1024 / 1024 / 1024 << " GByte Memory allocated\n";
    cout.precision(6);
 //  Row used for the result check: row 17, clamped to the last row for matrices
 //  with fewer than 18 rows (a fixed index 17 would read out of bounds there).
    size_t const row = (M > 17) ? 17 : (M > 0 ? M - 1 : 0);
 //  Data initialization
    for (size_t i = 0; i < M; ++i)
        for (size_t k = 0; k < L; ++k)
            A[L*i + k] = (i + k) % 219 + 1;
 //  B[k,j] = 1/A[row,k]  ==>  C[row,j] == L ; computed from the formula of A so that
 //  no matrix entry has to be read (A may be empty).
    for (size_t k = 0; k < L; ++k)
        for (size_t j = 0; j < N; ++j)
            B[N*k + j] = 1.0/((row + k) % 219 + 1);
    cout << "\nStart Benchmarking MatMat\n";
    auto t1 = steady_clock::now(); // start timer (monotonic clock, unaffected by system time adjustments)
 // Do calculation
    vector<double> C(M*N);
    double check = 0.0;       // initialized: it is read below even if the loop never runs
    double check_sum = 0;
    for (size_t i = 0; i < NLOOPS; ++i)
    {
        C = MatMat_parallel(A, B, L);
        check = (N*row < C.size()) ? C[N*row] : 0.0;
        check_sum += check; // prevents the optimizer from removing unused calculation results.
    }
    auto t2 = steady_clock::now();  // stop timer
    cout << check_sum;              // same output as before, but no console I/O inside the timed region
    auto duration = duration_cast<microseconds>(t2 - t1);        // duration in microseconds
    double t_diff = static_cast<double>(duration.count()) / 1e6; // overall duration in seconds
    t_diff = t_diff/NLOOPS;                                      // duration per loop seconds
 // Check the correct result (skipped if the result has no entry to check)
    if (N*row < C.size())
    {
        cout << "\n C[" << row << ",0] = " << check << endl;
        // rounding instead of truncation through unsigned int: a result like L - 1e-9 is still correct
        if (round(check) != static_cast<double>(L))
        {
            cout << "  !!   W R O N G  result   !!, should be " << L <<"\n";
        }
    }
    cout << endl;
 // Timings  and Performance
    cout << endl;
    cout.precision(2);
    double Gflops = (2.0*L*N*M) / t_diff / 1024 / 1024 / 1024;
    double MemBandwidth = (2.0*L*N*M + M*N)/ t_diff / 1024 / 1024 / 1024 * sizeof(A[0]);
    cout << "Total duration : " << t_diff*NLOOPS << endl;
    cout << "Timing in sec. : " << t_diff << endl;
    cout << "GFLOPS         : " << Gflops << endl;
    cout << "GiByte/s       : " << MemBandwidth << endl;
    return vector<double>{t_diff, Gflops, MemBandwidth};
 }
 /** Benchmark (D): evaluation of a polynomial of degree @p p at @p N points.
 *
 *  Calls poly_parallel() @p NLOOPS times, checks the value at the first point and prints the timings.
 *
 *  @param[in] NLOOPS  number of timed repetitions (must be > 0, the time is divided by it)
 *  @param[in] N       number of evaluation points
 *  @param[in] p       polynomial degree (p + 1 coefficients)
 *  @return {time per repetition in seconds, GFLOPS, GiByte/s}
 */
 vector<double> test_D(const size_t &NLOOPS, const size_t &N, const size_t &p)
 {
    cout << "#################### (D) ####################" << endl;
    cout << "\nLOOPS = " << NLOOPS << endl;
    cout << "\nN = " << N << endl;
    cout << "\np = " << p << endl;
 //  Memory allocation
    cout << "Memory allocation\n";
    vector<double> a(p + 1, 0);
    vector<double> x(N);
    cout.precision(2);
    cout << (1.0*(p + 1) + N) *sizeof(x[0]) / 1024 / 1024 / 1024 << " GByte Memory allocated\n";
    cout.precision(6);
 //  Data initialization
    for (size_t j = 0; j < N; ++j)
        x[j] = 1.0*j;
    for (size_t k = 0; k < p + 1; ++k)
        a[k] = (k % 2 == 0) ? 1.0 : -1.0;   // (-1)^k:  poly(x) = 1 - x + x^2 - x^3 + x^4 - ...
    cout << "\nStart Benchmarking poly\n";
    auto t1 = steady_clock::now(); // start timer (monotonic clock, unaffected by system time adjustments)
 // Do calculation
    vector<double> y(N);
    double check = 0.0;       // initialized: it is read below even if the loop never runs
    double check_sum = 0.0;   // must start at 0: accumulating into an uninitialized double is undefined behavior
    for (size_t i = 0; i < NLOOPS; ++i)
    {
        y = poly_parallel(a, x);
        check = y.empty() ? 0.0 : y[0];
        check_sum += check; // prevents the optimizer from removing unused calculation results.
    }
    auto t2 = steady_clock::now();  // stop timer
    auto duration = duration_cast<microseconds>(t2 - t1);        // duration in microseconds
    double t_diff = static_cast<double>(duration.count()) / 1e6; // overall duration in seconds
    t_diff = t_diff/NLOOPS;                                      // duration per loop seconds
 // Check the correct result (skipped if there is no evaluation point)
 // x[0] == 0, hence poly(x[0]) has to be the constant coefficient a[0] == 1.
    if (!x.empty() && !y.empty())
    {
        cout << "\n poly(" << x[0] << ") = " << check << endl;
        if (fabs(check - 1.0) > 1.0/1e6)
        {
            cout << "  !!   W R O N G  result   !!\n";
        }
    }
    cout << endl;
 // Timings  and Performance
    cout << endl;
    cout.precision(2);
    double Gflops = (N*(p + 1)*3.0) / t_diff / 1024 / 1024 / 1024;
    double MemBandwidth = (N*(2.0 + 3.0*(p + 1)))/ t_diff / 1024 / 1024 / 1024 * sizeof(x[0]);
    cout << "Total duration : " << t_diff*NLOOPS << endl;
    cout << "Timing in sec. : " << t_diff << endl;
    cout << "GFLOPS         : " << Gflops << endl;
    cout << "GiByte/s       : " << MemBandwidth << endl;
    return vector<double>{t_diff, Gflops, MemBandwidth};
 }
--- a/ex5/ex5_4/benchmark_tests.h
+++ b/ex5/ex5_4/benchmark_tests.h
@ -0,0 +1,13 @@
 #pragma once
 #include <vector>
 using namespace std;
 /** Benchmark (A): times the inner product of two vectors of length @p N.
 *  @param[in] NLOOPS  number of timed repetitions
 *  @param[in] N       vector length
 *  @return {time per repetition in seconds, GFLOPS, GiByte/s}
 */
 vector<double> test_A(const size_t &NLOOPS, const size_t &N);
 /** Benchmark (A) sum: times the summation of the entries of a vector of length @p N.
 *  @param[in] NLOOPS  number of timed repetitions
 *  @param[in] N       vector length
 *  @return {time per repetition in seconds, GFLOPS, GiByte/s}
 */
 vector<double> test_A_sum(const size_t &NLOOPS, const size_t &N);
 /** Benchmark (B): times the product of a dense MxN matrix with a vector of length @p N.
 *  @param[in] NLOOPS  number of timed repetitions
 *  @param[in] N       number of matrix columns
 *  @param[in] M       number of matrix rows
 *  @return {time per repetition in seconds, GFLOPS, GiByte/s}
 */
 vector<double> test_B(const size_t &NLOOPS, const size_t &N, const size_t &M);
 /** Benchmark (C): times the product of a dense MxL matrix with a dense LxN matrix.
 *  @param[in] NLOOPS  number of timed repetitions
 *  @param[in] L       shared dimension
 *  @param[in] M       number of rows of the first matrix
 *  @param[in] N       number of columns of the second matrix
 *  @return {time per repetition in seconds, GFLOPS, GiByte/s}
 */
 vector<double> test_C(const size_t &NLOOPS, const size_t &L, const size_t &M, const size_t &N);
 /** Benchmark (D): times the evaluation of a polynomial with p+1 coefficients at @p N points.
 *  @param[in] NLOOPS  number of timed repetitions
 *  @param[in] N       number of evaluation points
 *  @param[in] p       polynomial degree
 *  @return {time per repetition in seconds, GFLOPS, GiByte/s}
 */
 vector<double> test_D(const size_t &NLOOPS, const size_t &N, const size_t &p);
--- a/ex5/ex5_4/benchmarks.cpp
+++ b/ex5/ex5_4/benchmarks.cpp
@ -0,0 +1,141 @@
 #include "benchmarks.h"
 #include <cassert>       // assert()
 #include <cmath>
 #include <iostream>
 #include <vector>
 #include <omp.h>
 // (A) Euclidean inner product <x,y> of two vectors of equal length (from skalar_stl).
 //     The loop is distributed over the OpenMP threads; their partial sums are
 //     combined by the reduction clause.
 double scalar_parallel(vector<double> const &x, vector<double> const &y)
 {
    assert(x.size() == y.size());
    size_t const len = x.size();
    double acc = 0.0;
 // alternative with run-time scheduling:
 //#pragma omp parallel for default(none) shared(x, y, len) reduction(+:acc) schedule(runtime)
 #pragma omp parallel for shared(x, y, len) reduction(+:acc)
    for (size_t idx = 0; idx < len; ++idx)
    {
        double const prod = x[idx] * y[idx];
        acc += prod;
    }
    return acc;
 }
 // (A) Sum of all entries of a vector; the partial sums of the OpenMP threads
 //     are combined by the reduction clause.
 double sum(vector<double> const &x)
 {
    size_t const len = x.size();
    double total = 0.0;
 #pragma omp parallel for shared(x, len) reduction(+:total)
    for (size_t idx = 0; idx < len; ++idx)
    {
        total += x[idx];
    }
    return total;
 }
 // (B) Matrix-vector product b = A*x (from intro_vector_densematrix).
 //     A is stored row-wise in a 1D vector; the number of rows follows from
 //     A.size() / x.size(). The rows are distributed over the OpenMP threads.
 vector<double> MatVec_parallel(vector<double> const &A, vector<double> const &x)
 {
    size_t const ncols = x.size();
    assert(A.size() % ncols == 0); // make sure multiplication is possible
    size_t const nrows = A.size()/ncols;
    vector<double> b(nrows);
 #pragma omp parallel for shared(A, x, ncols, nrows, b)
    for (size_t row = 0; row < nrows; ++row)
    {
        size_t const offset = ncols*row;   // position of A[row,0] in the 1D storage
        double dot = 0.0;
        for (size_t col = 0; col < ncols; ++col)
        {
            dot += A[offset + col] * x[col];
        }
        b[row] = dot;
    }
    return b;
 }
 // (C) Matrix-matrix product C = A*B for row-wise stored dense matrices.
 //     A is MxL and B is LxN, where M and N follow from the vector sizes and
 //     the shared dimension L. The rows of C are distributed over the OpenMP threads.
 vector<double> MatMat_parallel(vector<double> const &A, vector<double> const &B, size_t const &L)
 {
    assert(A.size() % L == 0 && B.size() % L == 0);
    size_t const nrows = A.size()/L;
    size_t const ncols = B.size()/L;
    vector<double> C(nrows*ncols);     // zero-initialized accumulator
 #pragma omp parallel for shared(A, B, nrows, ncols, L, C)
    for (size_t row = 0; row < nrows; ++row)
    {
        size_t const c_off = ncols*row;            // position of C[row,0]
        for (size_t inner = 0; inner < L; ++inner)
        {
            double const a_entry = A[L*row + inner];
            size_t const b_off = ncols*inner;      // position of B[inner,0]
            for (size_t col = 0; col < ncols; ++col)
            {
                C[c_off + col] += a_entry*B[b_off + col];
            }
        }
    }
    return C;
 }
 // (D) Evaluation of the polynomial  a[0] + a[1]*t + ... + a[p]*t^p  at every entry t of x
 //     using Horner's scheme. The evaluation points are distributed over the OpenMP threads.
 //     An empty coefficient vector yields 0 for every point.
 vector<double> poly_parallel(vector<double> const &a, vector<double> const &x)
 {
    size_t const N = x.size();
    size_t const ncoeff = a.size();
    vector<double> y(N, 0);
 #pragma omp parallel for shared(a, x, N, ncoeff, y)
    for (size_t i = 0; i < N; ++i)
    {
        double const x_temp = x[i];
        double y_temp = 0;
        // Horner step, highest coefficient first:  y <- x*y + a[k].
        // The value has to be assigned, not accumulated: "y += x*y + a[k]" equals
        // y <- (1+x)*y + a[k] and therefore evaluates the polynomial at 1+x.
        for (size_t k = ncoeff; k > 0; --k)
        {
            y_temp = x_temp*y_temp + a[k - 1];
        }
        y[i] = y_temp;
    }
    return y;
 }
--- a/ex5/ex5_4/benchmarks.h
+++ b/ex5/ex5_4/benchmarks.h
@ -0,0 +1,55 @@
 #pragma once
 #include <vector>
 using namespace std;
 /**     (A) Inner product of two vectors (from skalar_stl)
 	@param[in] x	vector
 	@param[in] y	vector of the same length as @p x
 	@return 	    resulting Euclidean inner product <x,y>
 */
 double scalar_parallel(vector<double> const &x, vector<double> const &y);
 /**     (A) Sum of the entries of a vector
 	@param[in] x	vector
 	@return 	    sum of all entries of @p x
 */
 double sum(vector<double> const &x);
 /** 	(B) Matrix-vector product (from intro_vector_densematrix)
 * 	@param[in] A	dense matrix (1D access, row-wise), its size has to be a multiple of the size of @p x
 *  @param[in] x	vector
 *
 *	@return    resulting vector A*x
 */
 vector<double> MatVec_parallel(vector<double> const &A, vector<double> const &x);
 /** 	(C) Matrix-matrix product
 * 	@param[in] A			MxL dense matrix (1D access)
 *  @param[in] B			LxN dense matrix (1D access)
 *  @param[in] shared_dim 	shared dimension L
 *
 *	@return    resulting MxN matrix (1D access)
 */
 vector<double> MatMat_parallel(vector<double> const &A, vector<double> const &B, size_t const &shared_dim);
 /** 	(D) Evaluation of a polynomial function using Horner's scheme
 * 	@param[in] a	coefficient vector, a[k] is the coefficient of x^k
 *  @param[in] x	vector with input values
 *
 *	@return    vector with output values, one per input value
 */
 vector<double> poly_parallel(vector<double> const &a, vector<double> const &x);
--- a/ex5/ex5_4/main.cpp
+++ b/ex5/ex5_4/main.cpp
@ -0,0 +1,84 @@
 #include "benchmark_tests.h"
 #include <iostream>
 #include <cmath>
 // Driver: runs the benchmarks (A) for a range of vector lengths, the benchmarks
 // (B), (C), (D) once, and prints summary tables for the (A) runs.
 int main()
 {   
    // One benchmark configuration: number of repetitions and vector length.
    // The lengths are exact integer constants; computing them with the floating
    // point function pow(10,k) and truncating to size_t can be off by one if pow
    // returns a value slightly below the power of ten.
    struct Run { size_t nloops; size_t n; };
    Run const scalar_runs[] = {
        {2000000, 1000}, {1000000, 10000}, {100000, 100000},
        {10000, 1000000}, {750, 10000000}, {125, 100000000}
    };
    Run const sum_runs[] = {
        {3000000, 1000}, {2000000, 10000}, {1000000, 100000},
        {50000, 1000000}, {2000, 10000000}, {250, 100000000}
    };
    vector<vector<double>> results_scalar;
    for (Run const &r : scalar_runs)
        results_scalar.push_back(test_A(r.nloops, r.n));
    vector<vector<double>> results_sum;
    for (Run const &r : sum_runs)
        results_sum.push_back(test_A_sum(r.nloops, r.n));
    test_B(100, 20000, 10000);
    test_C(25, 500, 1000, 1500);
    test_D(100, 100, 1000000);
    cout << endl << "###### Scalar ######" << endl;
    cout << "Timing\tGFLOPS\tGiByte/s" << endl;
    cout << "------------------------------" << endl;
    for (size_t i = 0; i < results_scalar.size(); ++i)
        cout << results_scalar[i][0] << "\t" << results_scalar[i][1] << "\t" << results_scalar[i][2] << endl;
    cout << endl << "###### Sum ######" << endl;
    cout << "Timing\tGFLOPS\tGiByte/s" << endl;
    cout << "------------------------------" << endl;
    for (size_t i = 0; i < results_sum.size(); ++i)
        cout << results_sum[i][0] << "\t" << results_sum[i][1] << "\t" << results_sum[i][2] << endl;
    // ###### Scalar ######
    // Timing  GFLOPS  GiByte/s
    // ------------------------------
    // 3.4e-06 0.54    4.3
    // 4.6e-06 4       32
    // 1.6e-05 12      95
    // 0.0011  1.7     13
    // 0.0097  1.9     15
    // 0.075   2.5     20
    // ###### Sum ######
    // Timing  GFLOPS  GiByte/s
    // ------------------------------
    // 5.5e-06 0.17    1.3
    // 5.4e-06 1.7     14
    // 1.5e-05 6.1     49
    // 0.00013 7.2     57
    // 0.0033  2.8     23
    // 0.032   2.9     23
    // ######### NOT PARALLEL (from exercise sheet 2) #########
    //      Timing  GFLOPS  GiByte/s
    // ----------------------------------
    // (A)  0.038   2.5     20
    // (B)  0.13    2.9     23
    // (C)  0.44    3.2     25
    // (D)  0.19    1.5     12
    return 0;
 }
		`@ -0,0 +1,501 @@`
							`141`
							`261`
							`87`
							`430`
							`258`
							`298`
							`425`
							`120`
							`496`
							`707`
							`244`
							`786`
							`75`
							`394`
							`4`
							`221`
							`2`
							`190`
							`143`
							`269`
							`175`
							`139`
							`599`
							`902`
							`940`
							`222`
							`483`
							`377`
							`524`
							`265`
							`69`
							`437`
							`174`
							`27`
							`955`
							`431`
							`962`
							`763`
							`8`
							`681`
							`706`
							`646`
							`553`
							`219`
							`773`
							`229`
							`371`
							`891`
							`857`
							`403`
							`319`
							`609`
							`911`
							`910`
							`592`
							`333`
							`854`
							`443`
							`905`
							`34`
							`533`
							`717`
							`180`
							`337`
							`188`
							`322`
							`404`
							`549`
							`49`
							`553`
							`275`
							`242`
							`244`
							`155`
							`957`
							`936`
							`819`
							`729`
							`176`
							`361`
							`189`
							`2`
							`317`
							`700`
							`626`
							`544`
							`440`
							`288`
							`502`
							`762`
							`763`
							`577`
							`748`
							`646`
							`124`
							`505`
							`348`
							`93`
							`148`
							`199`
							`673`
							`432`
							`695`
							`257`
							`10`
							`533`
							`280`
							`947`
							`907`
							`393`
							`25`
							`672`
							`838`
							`972`
							`57`
							`451`
							`583`
							`687`
							`720`
							`651`
							`727`
							`374`
							`582`
							`117`
							`58`
							`980`
							`285`
							`595`
							`963`
							`186`
							`194`
							`342`
							`933`
							`391`
							`274`
							`152`
							`398`
							`375`
							`132`
							`436`
							`92`
							`615`
							`11`
							`574`
							`790`
							`236`
							`449`
							`570`
							`62`
							`497`
							`643`
							`222`
							`838`
							`972`
							`847`
							`506`
							`279`
							`747`
							`237`
							`958`
							`621`
							`601`
							`173`
							`91`
							`256`
							`859`
							`912`
							`700`
							`726`
							`230`
							`577`
							`811`
							`404`
							`989`
							`90`
							`321`
							`512`
							`61`
							`726`
							`557`
							`530`
							`830`
							`859`
							`790`
							`318`
							`453`
							`753`
							`110`
							`110`
							`270`
							`525`
							`973`
							`711`
							`312`
							`292`
							`851`
							`912`
							`640`
							`256`
							`89`
							`839`
							`585`
							`949`
							`62`
							`585`
							`286`
							`828`
							`191`
							`443`
							`394`
							`827`
							`677`
							`208`
							`319`
							`134`
							`672`
							`571`
							`170`
							`148`
							`477`
							`909`
							`553`
							`33`
							`54`
							`806`
							`452`
							`383`
							`790`
							`365`
							`533`
							`712`
							`872`
							`329`
							`651`
							`975`
							`76`
							`588`
							`414`
							`310`
							`264`
							`759`
							`996`
							`187`
							`782`
							`196`
							`993`
							`803`
							`425`
							`729`
							`499`
							`809`
							`357`
							`74`
							`591`
							`911`
							`194`
							`433`
							`750`
							`40`
							`947`
							`764`
							`559`
							`184`
							`498`
							`518`
							`995`
							`855`
							`963`
							`679`
							`404`
							`935`
							`480`
							`232`
							`397`
							`706`
							`559`
							`757`
							`996`
							`963`
							`536`
							`964`
							`116`
							`52`
							`305`
							`581`
							`531`
							`902`
							`541`
							`432`
							`543`
							`713`
							`17`
							`801`
							`143`
							`479`
							`257`
							`370`
							`662`
							`170`
							`279`
							`199`
							`196`
							`327`
							`881`
							`472`
							`404`
							`180`
							`969`
							`408`
							`845`
							`616`
							`377`
							`878`
							`785`
							`465`
							`814`
							`899`
							`430`
							`335`
							`597`
							`902`
							`703`
							`378`
							`735`
							`955`
							`543`
							`541`
							`312`
							`72`
							`182`
							`93`
							`464`
							`10`
							`916`
							`643`
							`2`
							`31`
							`209`
							`455`
							`128`
							`9`
							`728`
							`355`
							`781`
							`437`
							`437`
							`50`
							`50`
							`92`
							`595`
							`242`
							`842`
							`858`
							`964`
							`489`
							`221`
							`227`
							`537`
							`763`
							`348`
							`462`
							`640`
							`918`
							`162`
							`716`
							`578`
							`434`
							`885`
							`394`
							`179`
							`634`
							`625`
							`328`
							`803`
							`1000`
							`981`
							`128`
							`233`
							`24`
							`608`
							`111`
							`408`
							`885`
							`549`
							`370`
							`209`
							`441`
							`957`
							`125`
							`471`
							`857`
							`44`
							`692`
							`979`
							`284`
							`134`
							`686`
							`910`
							`611`
							`900`
							`194`
							`755`
							`347`
							`419`
							`156`
							`820`
							`625`
							`739`
							`806`
							`68`
							`951`
							`498`
							`756`
							`743`
							`832`
							`157`
							`458`
							`619`
							`933`
							`836`
							`896`
							`583`
							`583`
							`855`
							`35`
							`886`
							`408`
							`37`
							`747`
							`155`
							`144`
							`606`
							`255`
							`325`
							`402`
							`407`
							`387`
							`610`
							`167`
							`189`
							`95`
							`324`
							`770`
							`235`
							`741`
							`693`
							`825`
							`828`
							`294`
							`310`
							`524`
							`326`
							`832`
							`811`
							`557`
							`263`
							`681`
							`234`
							`457`
							`385`
							`539`
							`992`
							`756`
							`981`
							`235`
							`529`
							`52`
							`757`
							`602`
							`858`
							`989`
							`930`
							`410`
							`1`
							`541`
							`208`
							`220`
							`326`
							`96`
							`748`
							`749`
							`544`
							`339`
							`833`
							`553`
							`958`
							`893`
							`357`
							`547`
							`347`
							`623`
							`797`
							`746`
							`126`
							`823`
							`26`
							`415`
							`732`
							`782`
							`368`