ex5

2025-12-05 23:13:00 +01:00 · 2025-12-05 23:13:00 +01:00 · 95b3017475
commit 95b3017475
parent 0386d99307
28 changed files with 5800 additions and 0 deletions
--- a/ex5/code/Makefile
+++ b/ex5/code/Makefile
@ -0,0 +1,31 @@
+# Builds the exercise program from all *.cpp files in this directory.
+PROGRAM	= main
+
+SOURCES = $(wildcard *.cpp)
+OBJECTS = ${SOURCES:.cpp=.o}
+
+CXX     = g++
+LINKER  = g++
+
+WARNINGS = -Wall -pedantic -Wextra -Weffc++ -Woverloaded-virtual -Wfloat-equal -Wshadow \
+           -Wredundant-decls -fmax-errors=1
+
+# The sources use structured bindings and <execution>, so C++17 is requested explicitly
+# instead of relying on the compiler's default language standard.
+CXXFLAGS   = -std=c++17 -g -flto -O3 -ffast-math -march=native -fopenmp ${WARNINGS}
+LINKFLAGS  = -g -flto -O3 -lopenblas -llapacke -fopenmp
+
+# These targets are commands, not files; never confuse them with files of the same name.
+.PHONY: all clean run
+
+all: ${PROGRAM}
+
+# %.o: %.cpp
+# 	${CXX} ${CXXFLAGS} -c $< -o $@
+
+${PROGRAM}:	${OBJECTS}
+	$(LINKER) ${OBJECTS} ${LINKFLAGS} -o ${PROGRAM}
+
+# -f keeps 'make clean' from failing when out_1.txt has not been generated yet
+clean:
+	rm -f ${OBJECTS} ${PROGRAM}
+	rm -f out_1.txt
+
+
+run: ${PROGRAM}
+# run: clean ${PROGRAM}
+	./${PROGRAM}
--- a/ex5/code/data_1.txt
+++ b/ex5/code/data_1.txt
@ -0,0 +1,500 @@
+141
+261
+87
+430
+258
+298
+425
+120
+496
+707
+244
+786
+75
+394
+4
+221
+2
+190
+143
+269
+175
+139
+599
+902
+940
+222
+483
+377
+524
+265
+69
+437
+174
+27
+955
+431
+962
+763
+8
+681
+706
+646
+553
+219
+773
+229
+371
+891
+857
+403
+319
+609
+911
+910
+592
+333
+854
+443
+905
+34
+533
+717
+180
+337
+188
+322
+404
+549
+49
+553
+275
+242
+244
+155
+957
+936
+819
+729
+176
+361
+189
+2
+317
+700
+626
+544
+440
+288
+502
+762
+763
+577
+748
+646
+124
+505
+348
+93
+148
+199
+673
+432
+695
+257
+10
+533
+280
+947
+907
+393
+25
+672
+838
+972
+57
+451
+583
+687
+720
+651
+727
+374
+582
+117
+58
+980
+285
+595
+963
+186
+194
+342
+933
+391
+274
+152
+398
+375
+132
+436
+92
+615
+11
+574
+790
+236
+449
+570
+62
+497
+643
+222
+838
+972
+847
+506
+279
+747
+237
+958
+621
+601
+173
+91
+256
+859
+912
+700
+726
+230
+577
+811
+404
+989
+90
+321
+512
+61
+726
+557
+530
+830
+859
+790
+318
+453
+753
+110
+110
+270
+525
+973
+711
+312
+292
+851
+912
+640
+256
+89
+839
+585
+949
+62
+585
+286
+828
+191
+443
+394
+827
+677
+208
+319
+134
+672
+571
+170
+148
+477
+909
+553
+33
+54
+806
+452
+383
+790
+365
+533
+712
+872
+329
+651
+975
+76
+588
+414
+310
+264
+759
+996
+187
+782
+196
+993
+803
+425
+729
+499
+809
+357
+74
+591
+911
+194
+433
+750
+40
+947
+764
+559
+184
+498
+518
+995
+855
+963
+679
+404
+935
+480
+232
+397
+706
+559
+757
+996
+963
+536
+964
+116
+52
+305
+581
+531
+902
+541
+432
+543
+713
+17
+801
+143
+479
+257
+370
+662
+170
+279
+199
+196
+327
+881
+472
+404
+180
+969
+408
+845
+616
+377
+878
+785
+465
+814
+899
+430
+335
+597
+902
+703
+378
+735
+955
+543
+541
+312
+72
+182
+93
+464
+10
+916
+643
+2
+31
+209
+455
+128
+9
+728
+355
+781
+437
+437
+50
+50
+92
+595
+242
+842
+858
+964
+489
+221
+227
+537
+763
+348
+462
+640
+918
+162
+716
+578
+434
+885
+394
+179
+634
+625
+328
+803
+1000
+981
+128
+233
+24
+608
+111
+408
+885
+549
+370
+209
+441
+957
+125
+471
+857
+44
+692
+979
+284
+134
+686
+910
+611
+900
+194
+755
+347
+419
+156
+820
+625
+739
+806
+68
+951
+498
+756
+743
+832
+157
+458
+619
+933
+836
+896
+583
+583
+855
+35
+886
+408
+37
+747
+155
+144
+606
+255
+325
+402
+407
+387
+610
+167
+189
+95
+324
+770
+235
+741
+693
+825
+828
+294
+310
+524
+326
+832
+811
+557
+263
+681
+234
+457
+385
+539
+992
+756
+981
+235
+529
+52
+757
+602
+858
+989
+930
+410
+1
+541
+208
+220
+326
+96
+748
+749
+544
+339
+833
+553
+958
+893
+357
+547
+347
+623
+797
+746
+126
+823
+26
+415
+732
+782
+368
--- a/ex5/code/main.cpp
+++ b/ex5/code/main.cpp
@ -0,0 +1,215 @@
+// clang-tidy *.cpp -checks=llvm-*,-llvm-header-guard -header-filter=.* -enable-check-profile -extra-arg="-std=c++17" -extra-arg="-fopenmp" -- *.cpp
+
+#include "task_2.h"
+#include "task_3.h"
+#include "task_4.h"
+#include "timing.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cmath>
+#include <execution>
+#include <iomanip>
+#include <iostream>
+#include <omp.h>
+#include <sstream>
+#include <vector>
+
+/** Task 2: statistics of the numbers stored in data_1.txt.
+ *
+ *  Reads the file, determines minimum, maximum, the arithmetic/geometric/harmonic mean
+ *  and the standard deviation in parallel, prints the results together with the elapsed
+ *  time and writes the six values to out_1.txt.
+ *  Nothing is computed if no data could be read.
+ */
+void task_2() {
+    printf("\n\n-------------- Task 2 --------------\n\n");
+
+    int threads = 4;
+    omp_set_num_threads(threads);
+    cout << omp_get_max_threads() << " threads have been started." << endl;
+
+    // Read vector
+    vector<double> a;
+    read_vector_from_file("data_1.txt", a);
+    if (a.empty()) {
+        // Dereferencing the result of min_element/max_element on an empty range is undefined.
+        printf("No data has been read - skipping the statistics.\n");
+        return;
+    }
+
+    tic();
+    // min and max
+        auto min = *min_element(std::execution::par, a.begin(), a.end());
+        auto max = *max_element(std::execution::par, a.begin(), a.end());
+    // means
+        auto [x,y,z] = means_par(a);
+    // deviation
+        // Plain copy of the arithmetic mean, so that no structured binding is used inside
+        // the OpenMP region.
+        const double mean = x;
+        double deviation(0.0);
+        #pragma omp parallel for shared(a) reduction(+:deviation)
+        for (size_t i=0; i<a.size(); i++){
+            const double diff = mean - a[i];
+            deviation += diff*diff;          // a product is much cheaper than pow(diff,2)
+        }
+        deviation = sqrt(deviation/static_cast<double>(a.size()));
+    double t = toc();
+
+    printf("Minimum: %f\n", min);
+    printf("Maximum: %f\n", max);
+    printf("Arithmetic: %f\n", x);
+    printf("Geometric: %f\n", y);
+    printf("Harmonic: %f\n", z);
+    printf("Deviation: %f\n", deviation);
+    printf("Execution time: %f\n", t);
+
+    // write results to file
+    vector<double> b = {min,max,x,y,z,deviation};
+    write_vector_to_file("out_1.txt", b);
+}
+
+/** Task 3: Goldbach decompositions.
+ *
+ *  Prints the number of decompositions of one fixed even number together with the
+ *  decompositions themselves, then determines for several upper bounds n the number k
+ *  with the most decompositions and reports the run time of the parallel count.
+ */
+void task_3() {
+    printf("\n\n-------------- Task 3 --------------\n\n");
+
+    int threads = 4;
+    omp_set_num_threads(threads);
+    cout << omp_get_max_threads() << " threads have been started." << endl;
+
+    // #####################################
+    // single_goldbach(k)
+    int k = 694;
+    printf("single_goldbach(k = %d) = %d\n", k, single_goldbach_par(k));
+
+    // Prints decompositions
+    print_decomps(k);
+
+    // count_goldbach(n)
+    // printf("\nNOTE: For n=2'000'000 it will take ~30 seconds.\n");
+    for (int n : {10'000, 100'000, 400'000, 1'000'000, 2'000'000/*, 10'000'000*/}) {
+        tic();
+        vector<int> counts = count_goldbach_par(n);
+        double sec = toc();
+
+        auto best = max_element(counts.begin(), counts.end());
+        // The iterator difference is a ptrdiff_t, hence %td (and not %ld).
+        printf("count_goldbach(n = %d): k = %td, decompositions = %d, time elapsed: %f milliseconds\n", n, best-counts.begin(), *best, sec*1000);
+    }
+    printf("Should be:               k = 9240, 99330, 390390, 990990, 1981980, 9699690\n");
+    printf("            decompositions =  329,  2168,   7094,  15594,   27988,  124180\n\n");
+}
+
+/** Prints one table of speedups (rows: exponent k of the vector length, columns: threads).
+ *
+ *  @param[in] title    name of the measured kernel, printed in the headline
+ *  @param[in] speedup  speedups stored row-wise, entry (k-k_min)*T + t belongs to
+ *                      exponent k and t+1 threads
+ *  @param[in] k_min    smallest exponent (first row)
+ *  @param[in] k_max    largest exponent (last row)
+ *  @param[in] T        number of thread counts (columns)
+ */
+static void print_speedup_table(const char *title, const vector<double> &speedup,
+                                int k_min, int k_max, int T) {
+    cout << "\n\nSpeedup: " << title << endl;
+    cout << "k \\ threads |  ";
+    for (int t=0; t<T; t++) {cout << setw(2) << t+1 << "  |  ";}
+    cout << endl;
+    for (int k=k_min; k<=k_max; ++k) {
+        cout << "     " << k << "      |";
+        for (int t=0; t<T; t++) {
+            cout << speedup[(k-k_min)*T+t] << "|";
+        }
+        cout << endl;
+    }
+}
+
+/** Task 4: benchmarks (B) matrix-vector, (C) matrix-matrix and (D) polynomial evaluation,
+ *  followed by a speedup study of summation and scalar product for vector lengths
+ *  10^3,...,10^9 and 1,...,16 threads.
+ */
+void task_4() {
+    printf("\n\n-------------- Task 4 --------------\n\n");
+
+    int threads = 32;
+    omp_set_num_threads(threads);
+    cout << omp_get_max_threads() << " threads have been started." << endl;
+
+    size_t M, N, L, p, NLOOPS;
+
+    { //      Matrix-Vector product
+    printf("----- Benchmark (B) -----\n");
+    // Initialization
+        M = 8'000;
+        N = 12'000;
+        NLOOPS = 30;
+        auto [A,x] = init_B(M,N);
+    // Benchmark
+        tic();
+        benchmark_B(A, x, NLOOPS, false);
+        double sec = toc() / NLOOPS;
+    // Timings and Performance
+        size_t memory = M*N + M + N;
+        size_t flops  = 2 * M * N;
+        print_performance(sec, memory, flops, sizeof(A[0]));
+    printf("-------------------------\n");
+    }
+
+    { //     Matrix-Matrix product
+    printf("----- Benchmark (C) -----\n");
+    // Initialization
+        M = 1'000;
+        N = 2'000;
+        L = 500;
+        NLOOPS = 20;
+        auto [A,B] = init_C(M,N,L);
+    // Benchmark
+        tic();
+        benchmark_C(A, B, L, NLOOPS, false);
+        double sec = toc() / NLOOPS;
+    // Timings and Performance
+        size_t memory = M*L + L*N + M*N;
+        size_t flops  = M * 2*L * N;
+        print_performance(sec, memory, flops, sizeof(A[0]));
+    printf("-------------------------\n");
+    }
+
+    { //      Polynomial evaluation
+    printf("----- Benchmark (D) -----\n");
+    // Initialization
+        N = 1'000'000;
+        p = 200;
+        NLOOPS = 20;
+        auto [x,a] = init_D(N,p);
+    // Benchmark
+        tic();
+        benchmark_D(x, a, NLOOPS);
+        double sec = toc() / NLOOPS;
+    // Timings  and Performance
+        size_t memory = 2 * N;
+        size_t flops  = 2 * N * p;
+        print_performance(sec, memory, flops, sizeof(x[0]));
+    printf("-------------------------\n");
+    }
+
+
+    // Timing
+    NLOOPS = 50;
+    const int K_MIN = 3, K_MAX = 9, T = 16;
+    const int SIZES = K_MAX - K_MIN + 1;
+    vector<double> speedup_sum(SIZES*T), speedup_scalar(SIZES*T);
+    for (int k=0; k<SIZES; ++k) {
+        // Row k of the tables belongs to the vector length 10^(K_MIN+k). Integer arithmetic
+        // avoids the rounding of pow() (the code is compiled with -ffast-math).
+        // NOTE: the largest length 10^K_MAX needs about 16 GB for the two vectors;
+        //       reduce K_MAX on machines with less memory.
+        N = 1;
+        for (int e=0; e<K_MIN+k; ++e) { N *= 10; }
+        auto [x,y] = init_A(N);
+        for (int t=0; t<T; t++) {
+            omp_set_num_threads(t+1);
+
+            tic();
+            benchmark_summation(x, NLOOPS);
+            speedup_sum[k*T+t] = toc() / NLOOPS;
+
+            tic();
+            benchmark_A(x, y, NLOOPS, false);
+            speedup_scalar[k*T+t] = toc() / NLOOPS;
+        }
+    }
+
+    // Calculating speedup: run time with one thread divided by run time with t+1 threads
+    for (int k=0; k<SIZES; ++k) {
+        double t0 = speedup_sum[k*T];
+        double t00 = speedup_scalar[k*T];
+        for (int t=0; t<T; t++){
+            speedup_sum[k*T+t] = t0/speedup_sum[k*T+t];
+            speedup_scalar[k*T+t] = t00/speedup_scalar[k*T+t];
+        }
+    }
+
+    // Printing tables
+    cout << fixed << setprecision(4);
+    print_speedup_table("summation", speedup_sum, K_MIN, K_MAX, T);
+    print_speedup_table("scalar", speedup_scalar, K_MIN, K_MAX, T);
+}
+
+/** Program entry point: runs the tasks 2, 3 and 4 one after another. */
+int main() {
+    task_2();
+    task_3();
+    task_4();
+}
--- a/ex5/code/mayer_primes.h
+++ b/ex5/code/mayer_primes.h
@ -0,0 +1,73 @@
+#pragma once
+
+#include <cstring> //memset
+#include <vector>
+//using namespace std;
+
+/** \brief Determines all prime numbers in interval [2, @p max].
+ *
+ *  The sieve of Eratosthenes is used.
+ *
+ *  The implementation originates from <a href="http://code.activestate.com/recipes/576559-fast-prime-generator/">Florian Mayer</a>.
+ *
+ * \param[in]   max end of interval for the prime number search.
+ * \return  vector of prime numbers @f$2,3,5, ..., p<=max @f$.
+ *
+ * \copyright
+ * Copyright (c) 2008 Florian Mayer          (adapted by Gundolf Haase 2018)
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+template <class T>
+std::vector<T> get_primes(T max)
+{
+    std::vector<T> primes;
+    if (max < 2) {
+        // No primes below 2; this also keeps a negative max away from the sieve size.
+        return primes;
+    }
+
+    // Index arithmetic is done in the size type of the sieve, so that neither the
+    // candidate x nor its multiples j can overflow the (possibly narrower) type T.
+    using Index = typename std::vector<unsigned char>::size_type;
+    Index const limit = static_cast<Index>(max);
+
+    // Bit x%8 of byte x/8 tells whether x is still a prime candidate. All bits start as 1.
+    // The vector releases its memory automatically, even if push_back throws.
+    std::vector<unsigned char> sieve(limit / 8 + 1, 0xFF);
+    for (Index x = 2; x <= limit; ++x)
+    {
+        if (sieve[x / 8] & (1u << (x % 8))) {
+            primes.push_back(static_cast<T>(x));
+            // Is prime. Mark its multiples as composite.
+            for (Index j = 2 * x; j <= limit; j += x)
+            {
+                sieve[j / 8] &= static_cast<unsigned char>(~(1u << (j % 8)));
+            }
+        }
+    }
+    return primes;
+}
+
+//---------------------------------------------------------------
+//int main()        // by  Florian Mayer
+//{g++ -O3  -std=c++14 -fopenmp main.cpp && ./a.out
+//    vector<unsigned long> primes;
+//    primes = get_primes(10000000);
+//    // return 0;
+//    // Print out result.
+//    vector<unsigned long>::iterator it;
+//    for(it=primes.begin(); it < primes.end(); it++)
+//        cout << *it << " ";
+//
+//    cout << endl;
+//    return 0;
+//}
--- a/ex5/code/out_1.txt
+++ b/ex5/code/out_1.txt
@ -0,0 +1,6 @@
+1
+1000
+498.184
+364.412
+95.6857
+287.905
--- a/ex5/code/task_2.cpp
+++ b/ex5/code/task_2.cpp
@ -0,0 +1,93 @@
+#include "task_2.h"
+#include <cassert>          // assert
+#include <cmath>
+#include <fstream>
+#include <omp.h>
+
+/** Determines the smallest and the largest entry of @p v with an OpenMP min/max reduction.
+ *
+ *  @param[in] v  vector with double values, must contain at least one element
+ *  @return    tuple (minimum, maximum)
+ */
+tuple<double, double> min_max_par(const vector<double> &v) {
+    assert(!v.empty() && "min_max_par needs at least one element");
+    // The running values have to be doubles; integer variables would truncate the data.
+    double min_val = v[0];
+    double max_val = v[0];
+
+    #pragma omp parallel for reduction(min:min_val) reduction(max:max_val)
+    for (size_t i = 1; i < v.size(); ++i) {
+        if (v[i] < min_val) min_val = v[i];
+        if (v[i] > max_val) max_val = v[i];
+    }
+
+    return make_tuple(min_val,max_val);
+}
+
+
+/** Computes arithmetic, geometric and harmonic mean of the entries of @p v.
+ *
+ *  The three sums (values, logarithms, reciprocals) are accumulated in one parallel
+ *  loop with an OpenMP sum reduction.
+ *
+ *  @param[in] v  vector with double values
+ *  @return    tuple (arithmetic mean, geometric mean, harmonic mean)
+ */
+tuple<double, double, double> means_par(const vector<double>& v){
+    size_t count = v.size();
+    double total = 0;
+    double log_total = 0;
+    double reciprocal_total = 0;
+
+    #pragma omp parallel for shared(v,count) reduction(+:total, log_total, reciprocal_total)
+    for (size_t idx = 0; idx<count; ++idx){
+        total += v[idx];
+        log_total += log(v[idx]);
+        reciprocal_total += 1.0/v[idx];
+    }
+
+    double const n_real = static_cast<double>(count);
+    return make_tuple(total / n_real,
+                      exp(1.0/n_real * log_total),
+                      n_real / reciprocal_total);
+}
+
+/** Appends all doubles that can be read from @p istr to @p v.
+ *
+ *  Reading stops at the first extraction that fails. If the stream did not stop at its
+ *  end, a message is printed; a stream in bad state raises an exception, a mere
+ *  format failure is reported and the stream state is reset.
+ *
+ *  @param[in,out] istr  input stream
+ *  @param[in,out] v     vector that receives the values
+ */
+void fill_vector(istream& istr, vector<double>& v)
+{
+    for (double value = 0; istr >> value; ) {     // read as long as extraction succeeds
+        v.push_back(value);
+    }
+
+    const bool ended_regularly = istr.eof();
+    if (!ended_regularly)
+    { // error handling
+        cout << " Error handling \n";
+        if ( istr.bad() ) {
+            throw runtime_error("Schwerer Fehler in istr");
+        }
+        if ( istr.fail() )   // try to clean up
+        {
+            cout << " Failed in reading all data.\n";
+            istr.clear();
+        }
+    }
+    v.shrink_to_fit();                 // C++11
+}
+
+
+/** Replaces the content of @p v by the doubles stored in the ASCII file @p file_name.
+ *
+ *  If the file cannot be opened, a message is printed and the assertion fails;
+ *  @p v is left untouched in that case.
+ *
+ *  @param[in]  file_name  name of the ASCII file
+ *  @param[out] v          vector with the values read
+ */
+void read_vector_from_file(const string& file_name, vector<double>& v)
+{
+    ifstream fin(file_name);           // open the file in ASCII mode
+    if( !fin.is_open() )               // file not found:
+    {
+        cout << "\nFile " << file_name << " has not been found.\n\n" ;
+        assert( fin.is_open() && "File not found." );       // exeption handling for the poor programmer
+        return;
+    }
+
+    // file found: discard the old content and read the new one
+    v.clear();
+    fill_vector(fin, v);
+}
+
+/** Writes the entries of @p v to the ASCII file @p file_name, one value per line.
+ *
+ *  An existing file is overwritten. If the file cannot be opened, a message is printed
+ *  and the assertion fails.
+ *
+ *  @param[in] file_name  name of the ASCII file
+ *  @param[in] v          vector with the values to be written
+ */
+void write_vector_to_file(const string& file_name, const vector<double>& v)
+{
+    ofstream fout(file_name);          // open the file in ASCII mode
+    if( !fout.is_open() )
+    {
+        cout << "\nFile " << file_name << " has not been opened.\n\n" ;
+        assert( fout.is_open() && "File not opened."  );         // exeption handling for the poor programmer
+        return;
+    }
+
+    for (const double value : v)
+    {
+        fout << value << endl;
+    }
+}
--- a/ex5/code/task_2.h
+++ b/ex5/code/task_2.h
@ -0,0 +1,38 @@
+#pragma once
+#include <iostream>
+#include <vector>
+using namespace std;
+
+/**
+  Determines minimum and maximum of the entries of @p v in one parallel loop.
+  The first entry is used as start value, so @p v must not be empty.
+  @param[in]    v           C++ vector with double values
+  @return       tuple (minimum, maximum)
+*/
+tuple<double, double> min_max_par(const vector<double> &v);
+/**
+  Computes the arithmetic, the geometric and the harmonic mean of the entries of @p v
+  in one parallel loop.
+  @param[in]    v           C++ vector with double values
+  @return       tuple (arithmetic mean, geometric mean, harmonic mean)
+*/
+tuple<double, double, double> means_par(const vector<double>& v);
+
+/**
+  This function opens the ASCII-file named @p file_name and reads the
+  double data into the C++ vector @p v.
+  If the file  @p file_name does not exist then the code stops with an appropriate message.
+  @param[in]    file_name   name of the ASCII-file
+  @param[out]   v           C++ vector with double values
+*/
+
+void read_vector_from_file(const string& file_name, vector<double>& v);
+
+
+/**
+  This function opens the ASCII-file named @p file_name and overwrites it with the
+  double data from the C++ vector @p v.
+  If there are problems in opening/generating the file @p file_name
+  then the code stops with an appropriate message.
+  @param[in]    file_name   name of the ASCII-file
+  @param[in]    v           C++ vector with double values
+*/
+
+void write_vector_to_file(const string& file_name, const vector<double>& v);
+
+/**
+  Appends data from the input stream @p istr to the double-vector @p v until this input stream
+  ends regularly or a value cannot be read. The vector is not cleared beforehand; its memory
+  is allocated automatically.
+  @param[in]    istr        input stream
+  @param[out]    v           C++ vector with double values
+  @warning     A message is printed in case of a wrong data format; an exception is thrown
+               only if the stream is corrupted (bad state).
+*/
+void fill_vector(istream& istr, vector<double>& v);
--- a/ex5/code/task_3.cpp
+++ b/ex5/code/task_3.cpp
@ -0,0 +1,73 @@
+#include "task_3.h"
+#include "mayer_primes.h"
+#include "timing.h"
+
+#include <algorithm>
+#include <cassert>
+#include <iostream>
+#include <omp.h>
+#include <vector>
+using namespace std;
+
+
+/** Counts the decompositions k = p + q with primes p <= q.
+ *
+ *  For every prime p the partner q = k - p is looked up by binary search among the
+ *  primes >= p (the list of primes is sorted), which replaces the scan over all pairs.
+ *
+ *  @param[in] k  number to decompose
+ *  @return    number of unordered pairs of primes that sum up to @p k
+ */
+int single_goldbach_par(int k) {
+    const vector<int> primes = get_primes(k);
+    int count = 0;
+
+    #pragma omp parallel for reduction(+:count)
+    for (size_t i = 0; i < primes.size(); i++) {
+        // Searching only in [i, end) enforces p <= q, i.e. each pair is counted once.
+        if (std::binary_search(primes.begin() + i, primes.end(), k - primes[i])) {
+            count++;
+        }
+    }
+
+    return count;
+}
+
+
+/** Counts for every number s in [0,n] the decompositions s = p + q with odd primes p <= q.
+ *
+ *  Each thread accumulates into its private copy of the result vector; the copies are
+ *  summed up by the user-defined reduction VecAdd.
+ *  NOTE(review): the outer loop starts at the second prime (3), so the decomposition
+ *  4 = 2 + 2 is not counted - confirm that this is intended.
+ *
+ *  @param[in] n  upper bound of the numbers to decompose
+ *  @return    vector of length n+1, entry s holds the number of decompositions of s
+ */
+vector<int> count_goldbach_par(int n) {
+    const vector<int> primes = get_primes(n);
+    vector<int> counts(n+1);
+
+    // The inner loop gets shorter with growing i, so a dynamic schedule balances
+    // the work between the threads.
+    #pragma omp parallel for schedule(dynamic, 64) reduction(VecAdd:counts)
+    for (size_t i = 1; i < primes.size(); i++) {
+        for (size_t j = i; j < primes.size(); j++) {
+            // The primes are sorted: once p+q exceeds n all further sums do as well.
+            // Written as a difference, so that the comparison cannot overflow.
+            if (primes[j] > n - primes[i]) {
+                break;
+            }
+            counts[primes[i] + primes[j]]++;
+        }
+    }
+    return counts;
+}
+
+
+/** Prints all decompositions k = p + q with primes p <= q in one line.
+ *
+ *  @param[in] k  number to decompose
+ */
+void print_decomps(int k) {
+    const vector<int> primes = get_primes(k);
+    cout << "\nDecompositions for k = " << k << ": ";
+
+    for (auto first = primes.begin(); first != primes.end(); ++first) {
+        for (auto second = first; second != primes.end(); ++second) {
+            if (*first + *second != k) {
+                continue;                    // not a decomposition of k
+            }
+            cout << *first << " + " << *second << ", ";
+        }
+    }
+    cout << endl;
+}
--- a/ex5/code/task_3.h
+++ b/ex5/code/task_3.h
@ -0,0 +1,33 @@
+#pragma once
+#include <cassert>
+#include <vector>
+using namespace std;
+
+
+// Counts number of possible decompositions with 2 primes that sum up to k.
+int single_goldbach_par(int k);
+
+// Counts number of possible decompositions with 2 primes that sum up to k for all even numbers k \in {4,...,n}.
+vector<int> count_goldbach_par(int n);
+
+// Prints all decompositions of k.
+void print_decomps(int k);
+
+
+/** 	 Adds the elements of vector @p b to the corresponding elements of vector @p a .
+	@param[in,out] a	vector that accumulates the sum
+	@param[in]     b	vector to be added, must have the same size as @p a
+	@return 	     reference to @p a after a+=b componentwise
+*/
+template<class T>
+std::vector<T> &operator+=(std::vector<T> &a, std::vector<T> const &b)
+{
+    assert(a.size()==b.size());
+    for (size_t k = 0; k < a.size(); ++k) {
+        a[k] += b[k];
+    }
+    return a;
+}
+
+// User-defined OpenMP reduction that sums up integer vectors componentwise.
+// Each private copy starts as a zero vector of the original's size (the neutral element
+// of the sum). Copying the original instead would add its content once per thread.
+#pragma omp declare reduction(VecAdd : std::vector<int>  : omp_out += omp_in) \
+ initializer (omp_priv=decltype(omp_orig)(omp_orig.size()))
--- a/ex5/code/task_4.cpp
+++ b/ex5/code/task_4.cpp
@ -0,0 +1,232 @@
+#include "task_4.h"
+#include "timing.h"
+#include <cassert>
+#include <cblas.h>               // cBLAS Library
+#include <iostream>
+#include <vector>
+using namespace std;
+
+/** Matrix-vector product b = A*x for a matrix stored row-wise.
+ *
+ *  The number of columns N is the length of @p x, the number of rows is A.size()/N
+ *  (surplus entries of @p A are ignored). The rows are distributed among the threads.
+ *
+ *  @param[in] A  matrix entries, row-wise
+ *  @param[in] x  vector
+ *  @return    product A*x; empty if @p x is empty
+ */
+vector<double> matrix_vec(vector<double> const &A, vector<double> const &x) {
+    size_t const N = x.size();
+    if (N == 0) {
+        return vector<double>();       // no columns: avoids the division by zero below
+    }
+    size_t const M = A.size() / N;
+    vector<double> b(M);
+
+    #pragma omp parallel for shared(A,x,N,M,b)
+    for (size_t i = 0; i < M; ++i) {
+        // Accumulate in a local variable and store once per row.
+        double row_sum = 0.0;
+        for (size_t j = 0; j < N; ++j) {
+            row_sum += A[i*N + j] * x[j];
+        }
+        b[i] = row_sum;
+    }
+    return b;
+}
+
+
+/** Matrix-matrix product C = A*B for matrices stored row-wise.
+ *
+ *  @p A has @p M rows and L = A.size()/M columns, @p B has L rows and N = B.size()/L
+ *  columns. The rows of C are distributed among the threads.
+ *
+ *  @param[in] A  entries of the first matrix, row-wise
+ *  @param[in] B  entries of the second matrix, row-wise
+ *  @param[in] M  number of rows of @p A
+ *  @return    entries of the M x N product, row-wise; empty if @p M is 0 or @p A has no columns
+ */
+vector<double> matrix_matrix(vector<double> const &A, vector<double> const &B, size_t const &M) {
+    if (M == 0) {
+        return vector<double>();       // avoids the division by zero in A.size()/M
+    }
+    size_t const L = A.size() / M;
+    if (L == 0) {
+        return vector<double>();       // avoids the division by zero in B.size()/L
+    }
+    size_t const N = B.size() / L;
+    vector<double> C(M*N,0);
+
+    // Loop order i-k-j: the innermost loop runs through rows of B and C contiguously.
+    #pragma omp parallel for shared(A,B,M,L,N,C)
+    for (size_t i = 0; i < M; ++i) {
+        for (size_t k = 0; k < L; ++k) {
+            double const a_ik = A[i*L + k];
+            for (size_t j = 0; j < N; ++j) {
+                C[i*N + j] += a_ik * B[k*N + j];
+            }
+        }
+    }
+    return C;
+}
+
+
+/** Evaluates the polynomial a[0] + a[1]*t + ... + a[p-1]*t^(p-1) at every entry t of @p x.
+ *
+ *  Horner's scheme is used; the evaluation points are distributed among the threads.
+ *
+ *  @param[in] x  evaluation points
+ *  @param[in] a  coefficients, a[j] belongs to the power j
+ *  @return    vector with one polynomial value per entry of @p x (all zero if @p a is empty)
+ */
+vector<double> poly(vector<double> const &x, vector<double> const &a) {
+    size_t N = x.size();
+    size_t p = a.size();
+    vector<double> y(N);               // zero initialized
+    if (p == 0) {
+        return y;                      // empty sum
+    }
+
+    #pragma omp parallel for shared(x,a,N,p,y)
+    for (size_t i = 0; i < N; ++i) {
+        // Start with the leading coefficient a[p-1] (a[p] would be out of bounds)
+        // and work down to a[0].
+        double value = a[p-1];
+        for (size_t k = p-1; k > 0; --k) {
+            value = value*x[i] + a[k-1];
+        }
+        y[i] = value;
+    }
+    return y;
+}
+
+/** Scalar product of the vectors @p x and @p y, computed with an OpenMP sum reduction.
+ *
+ *  @param[in] x  first vector
+ *  @param[in] y  second vector of the same length
+ *  @return    sum of x[i]*y[i]
+ */
+double scalar(vector<double> const &x, vector<double> const &y) {
+    assert(x.size() == y.size());
+    size_t const length = x.size();
+    double dot = 0.0;
+
+    #pragma omp parallel for shared(x,y,length) reduction(+:dot)
+    for (size_t idx = 0; idx < length; ++idx) {
+        dot += x[idx] * y[idx];
+    }
+    return dot;
+}
+
+/** Sum of all entries of @p x, computed with an OpenMP sum reduction.
+ *
+ *  @param[in] x  vector
+ *  @return    sum of the entries
+ */
+double summation(vector<double> const &x){
+    size_t length = x.size();
+    double total = 0.0;
+
+    #pragma omp parallel for shared(x,length) reduction(+:total)
+    for (size_t idx = 0; idx < length; ++idx) {
+        total += x[idx];
+    }
+
+    return total;
+}
+
+
+
+// ##########################################################################
+
+
+/** Prints memory footprint, run time and the achieved performance of one benchmark.
+ *
+ *  @param[in] sec     run time of one loop in seconds
+ *  @param[in] memory  number of data items accessed in one loop
+ *  @param[in] flops   number of floating point operations in one loop
+ *  @param[in] size    size of one data item in bytes
+ */
+void print_performance(double sec, size_t memory, size_t flops, unsigned int size) {
+    double const GIBI  = 1024.0 * 1024.0 * 1024.0;    // bytes per GiByte
+    double const GIGA  = 1.0e9;                       // operations per GFLOP
+    double const bytes = static_cast<double>(memory) * size;
+
+    // Memory is reported in binary units (GiByte), the operation rate in decimal GFLOPS.
+    printf("Memory allocated  : %.3f GiByte\n", bytes / GIBI);
+    printf("Duration per loop : %.3f sec\n", sec);
+    printf("GFLOPS            : %.3f\n", static_cast<double>(flops) / sec / GIGA);
+    printf("GiByte/s          : %.3f\n", bytes / sec / GIBI);
+}
+
+/** Test data for the scalar product: x[i] = i%219 + 1 and y[i] = 1/x[i],
+ *  so that every product x[i]*y[i] is (up to rounding) 1.
+ *
+ *  @param[in] N  vector length
+ *  @return    tuple (x, y)
+ */
+tuple<vector<double>, vector<double>> init_A(size_t N) {
+    // The vectors are created inside the returned tuple and filled in place;
+    // make_tuple(x, y) would copy both (possibly huge) vectors.
+    tuple<vector<double>, vector<double>> data{vector<double>(N), vector<double>(N)};
+    vector<double> &x = get<0>(data);
+    vector<double> &y = get<1>(data);
+    for (size_t i = 0; i < N; ++i) {
+        x[i] = i%219 + 1.0;
+        y[i] = 1.0 / x[i];
+    }
+    return data;
+}
+
+/** Benchmark of the scalar product: repeats it @p NLOOPS times and checks the result.
+ *
+ *  With the data of init_A every scalar product equals the vector length N, hence the
+ *  accumulated sum has to be N*NLOOPS (up to rounding).
+ *
+ *  @param[in] x       first vector
+ *  @param[in] y       second vector
+ *  @param[in] NLOOPS  number of repetitions
+ *  @param[in] cblas   true: use cblas_ddot, false: use scalar()
+ */
+void benchmark_A(vector<double> const &x, vector<double> const &y, size_t NLOOPS, bool cblas) {
+    size_t N = x.size();
+
+    double sum(0.0);
+    if (cblas) {
+        for (size_t i = 0; i < NLOOPS; ++i) {
+            sum += cblas_ddot(N, x.data(), 1, y.data(), 1);
+        }
+    } else {
+        for (size_t i = 0; i < NLOOPS; ++i) {
+            sum += scalar(x, y);
+        }
+    }
+
+    // Check correctness with a relative tolerance: truncating the sum to an integer would
+    // reject a result like 0.999999*expected. The negated form also reports NaN.
+    double const expected = static_cast<double>(N) * static_cast<double>(NLOOPS);
+    double const error    = sum - expected;
+    double const tol      = 1e-6 * expected;
+    if (!(error <= tol && error >= -tol)) {printf("  !!   W R O N G  result   !!\n");}
+}
+
+/** Test data for the matrix-vector product: A[i,j] = (i+j)%219 + 1 (row-wise storage)
+ *  and x[j] = 1/A[17,j], so that entry 17 of A*x is (up to rounding) N.
+ *
+ *  @param[in] M  number of rows, has to be larger than 17
+ *  @param[in] N  number of columns
+ *  @return    tuple (A, x)
+ */
+tuple<vector<double>, vector<double>> init_B(size_t M, size_t N) {
+    assert((N == 0 || M > 17) && "row 17 of A is needed for x");
+    // The vectors are created inside the returned tuple and filled in place;
+    // make_tuple(A, x) would copy the whole matrix.
+    tuple<vector<double>, vector<double>> data{vector<double>(M*N), vector<double>(N)};
+    vector<double> &A = get<0>(data);
+    vector<double> &x = get<1>(data);
+    for (size_t i = 0; i < M; ++i) {
+        for (size_t j = 0; j < N; ++j) {
+            A[i*N + j] = (i+j)%219 + 1.0;
+        }
+    }
+    for (size_t j = 0; j < N; ++j) {
+        x[j] = 1.0/A[17*N + j];
+    }
+    return data;
+}
+
+/** Benchmark of the matrix-vector product: repeats it @p NLOOPS times and checks entry 17.
+ *
+ *  With the data of init_B entry 17 of A*x equals the number of columns N, hence the
+ *  accumulated sum has to be N*NLOOPS (up to rounding).
+ *
+ *  @param[in] A       matrix entries, row-wise, with more than 17 rows
+ *  @param[in] x       vector, must not be empty
+ *  @param[in] NLOOPS  number of repetitions
+ *  @param[in] cblas   true: use cblas_dgemv, false: use matrix_vec()
+ */
+void benchmark_B(vector<double> const &A, vector<double> const &x, size_t NLOOPS, bool cblas) {
+    size_t N = x.size();
+    assert(N > 0 && "x must not be empty");
+    size_t M = A.size() / N;
+    assert(M > 17 && "entry 17 of the product is used for the check");
+    vector<double> b(M);
+    double sum(0.0);
+
+    if (cblas) {
+        for (size_t i = 0; i < NLOOPS; ++i) {
+            cblas_dgemv(CblasRowMajor, CblasNoTrans, M, N, 1.0, A.data(), N, x.data(), 1, 0, b.data(), 1);
+            sum += b[17];
+        }
+    } else {
+        for (size_t i = 0; i < NLOOPS; ++i) {
+            b = matrix_vec(A,x);
+            sum += b[17];
+        }
+    }
+
+    // Check correctness with a relative tolerance: truncating the sum to an integer would
+    // reject a result like 0.999999*expected. The negated form also reports NaN.
+    double const expected = static_cast<double>(N) * static_cast<double>(NLOOPS);
+    double const error    = sum - expected;
+    double const tol      = 1e-6 * expected;
+    if (!(error <= tol && error >= -tol)) {printf("  !!   W R O N G  result   !!\n");}
+}
+
+/** Test data for the matrix-matrix product (row-wise storage):
+ *  A[i,j] = (i+j)%219 + 1 is M x L, B[i,j] = (i+j)%219 + 1 is L x N, except for
+ *  column 17 of B which is B[i,17] = 1/A[0,i], so that C[0,17] is (up to rounding) L.
+ *
+ *  @param[in] M  number of rows of A
+ *  @param[in] N  number of columns of B
+ *  @param[in] L  number of columns of A and rows of B
+ *  @return    tuple (A, B)
+ */
+tuple<vector<double>, vector<double>> init_C(size_t M, size_t N, size_t L) {
+    assert((M > 0 || N <= 17 || L == 0) && "row 0 of A is needed for column 17 of B");
+    // The vectors are created inside the returned tuple and filled in place;
+    // make_tuple(A, B) would copy both matrices.
+    tuple<vector<double>, vector<double>> data{vector<double>(M*L), vector<double>(L*N)};
+    vector<double> &A = get<0>(data);
+    vector<double> &B = get<1>(data);
+    for (size_t i = 0; i < M; ++i) {
+        for (size_t j = 0; j < L; ++j) {
+            A[i*L + j] = (i+j)%219 + 1.0;
+        }
+    }
+    for (size_t i = 0; i < L; ++i) {
+        for (size_t j = 0; j < N; ++j) {
+            B[i*N + j] = (i+j)%219 + 1.0;
+        }
+    }
+    // B chosen such that C[0,17]=L
+    // so B[i,17] = 1/A[0,i]
+    if (N > 17) {
+        for (size_t i = 0; i < L; ++i) {
+            B[i*N + 17] = 1.0/A[i];
+        }
+    }
+    return data;
+}
+
+/** Benchmark of the matrix-matrix product: repeats it @p NLOOPS times and checks C[0,17].
+ *
+ *  With the data of init_C the entry C[0,17] equals L, hence the accumulated sum has
+ *  to be L*NLOOPS (up to rounding).
+ *
+ *  @param[in] A       entries of the first matrix, row-wise
+ *  @param[in] B       entries of the second matrix, row-wise
+ *  @param[in] L       number of columns of A and rows of B, must not be 0
+ *  @param[in] NLOOPS  number of repetitions
+ *  @param[in] cblas   true: use cblas_dgemm, false: use matrix_matrix()
+ */
+void benchmark_C(vector<double> const &A, vector<double> const &B, size_t L, size_t NLOOPS, bool cblas) {
+    assert(L > 0 && "the inner dimension must not be 0");
+    size_t M = A.size() / L;
+    size_t N = B.size() / L;
+    assert(M*N > 17 && "entry 17 of the product is used for the check");
+    vector<double> C(M*N);
+    double sum(0.0);
+
+    if (cblas) {
+        for (size_t i = 0; i < NLOOPS; ++i) {
+            cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, M, N, L, 1.0, A.data(), L, B.data(), N, 0.0, C.data(), N);
+            sum += C[17];
+        }
+    } else {
+        for (size_t i = 0; i < NLOOPS; ++i) {
+            C = matrix_matrix(A,B,M);
+            sum += C[17];
+        }
+    }
+
+    // Check correctness with a relative tolerance: truncating the sum to an integer would
+    // reject a result like 0.999999*expected. The negated form also reports NaN.
+    double const expected = static_cast<double>(L) * static_cast<double>(NLOOPS);
+    double const error    = sum - expected;
+    double const tol      = 1e-6 * expected;
+    if (!(error <= tol && error >= -tol)) {printf("  !!   W R O N G  result   !!\n");}
+}
+
+/** Test data for the polynomial evaluation.
+ *
+ *  @param[in] N  number of evaluation points
+ *  @param[in] p  number of coefficients
+ *  @return    tuple (x, a) with x[i] = i/N and a[j] = 1
+ */
+tuple<vector<double>, vector<double>> init_D(size_t N, size_t p) {
+    // x_i = i/N for i=0,...,N-1
+    // a_j = 1   for j=0,...,p-1
+    // The vectors are created inside the returned tuple; make_tuple(x, a) would copy them.
+    tuple<vector<double>, vector<double>> data{vector<double>(N), vector<double>(p, 1.0)};
+    vector<double> &x = get<0>(data);
+    for (size_t i = 0; i < N; ++i) {
+        x[i] = static_cast<double>(i) / N;
+    }
+    return data;
+}
+
+/** Benchmark of the polynomial evaluation: repeats it @p NLOOPS times and checks y[0].
+ *
+ *  With the data of init_D the first evaluation point is 0 and all coefficients are 1,
+ *  so y[0] is 1 and the accumulated sum has to be NLOOPS.
+ *
+ *  @param[in] x       evaluation points, must not be empty
+ *  @param[in] a       coefficients of the polynomial
+ *  @param[in] NLOOPS  number of repetitions
+ */
+void benchmark_D(vector<double> const &x, vector<double> const &a, size_t NLOOPS) {
+    assert(!x.empty() && "y[0] is used for the check");
+    size_t N = x.size();
+    vector<double> y(N);
+    double sum(0.0);
+
+    for (size_t i = 0; i < NLOOPS; ++i) {
+        y = poly(x,a);
+        sum += y[0];
+    }
+
+    // Check correctness with a relative tolerance: truncating the sum to an integer would
+    // reject a result like 0.999999*expected. The negated form also reports NaN.
+    double const expected = static_cast<double>(NLOOPS);
+    double const error    = sum - expected;
+    double const tol      = 1e-6 * expected;
+    if (!(error <= tol && error >= -tol)) {printf("  !!   W R O N G  result sum = %f  !!\n", sum);}
+}
+
+/** Benchmark of the summation: calls summation(x) @p NLOOPS times.
+ *  The results are accumulated but not checked.
+ *
+ *  @param[in] x       vector to be summed up
+ *  @param[in] NLOOPS  number of repetitions
+ */
+void benchmark_summation(vector<double> const &x, size_t NLOOPS) {
+    double total(0.0);
+    for (size_t remaining = NLOOPS; remaining > 0; --remaining) {
+        double const partial = summation(x);
+        total += partial;
+    }
+}
--- a/ex5/code/task_4.h
+++ b/ex5/code/task_4.h
@ -0,0 +1,22 @@
+#pragma once
+#include <vector>
+using namespace std;
+
+// ---- Kernels (parallelized with OpenMP) ----
+
+// Matrix-vector product: A is stored row-wise with x.size() columns; returns A*x.
+vector<double> matrix_vec(vector<double> const &A, vector<double> const &x);
+// Matrix-matrix product: A is stored row-wise with M rows, B row-wise with A.size()/M rows;
+// returns the row-wise stored product.
+vector<double> matrix_matrix(vector<double> const &A, vector<double> const &B, size_t const &M);
+// Evaluates a polynomial with the coefficients a at every entry of x (Horner-like recurrence);
+// returns one value per entry of x.
+vector<double> poly(vector<double> const &x, vector<double> const &a);
+// Scalar product of x and y; both vectors must have the same length.
+double scalar(vector<double> const &x, vector<double> const &y);
+// Sum of all entries of x.
+double summation(vector<double> const &x);
+
+// Prints memory footprint, time per loop and the achieved rates.
+// memory: number of data items, flops: number of operations, size: bytes per data item.
+void print_performance(double sec, size_t memory, size_t flops, unsigned int size);
+// Test data (x, y) for the scalar product: x[i] = i%219 + 1, y[i] = 1/x[i].
+tuple<vector<double>, vector<double>> init_A(size_t N);
+// Test data (A, x) for the matrix-vector product; x holds the reciprocals of row 17 of A.
+tuple<vector<double>, vector<double>> init_B(size_t M, size_t N);
+// Test data (A, B) for the matrix-matrix product; A is M x L, B is L x N.
+tuple<vector<double>, vector<double>> init_C(size_t M, size_t N, size_t L);
+// Test data (x, a) for the polynomial evaluation: x[i] = i/N, a[j] = 1.
+tuple<vector<double>, vector<double>> init_D(size_t N, size_t p);
+
+
+// ---- Benchmarks: repeat a kernel NLOOPS times; A-D print a warning on a wrong result ----
+// cblas selects the cBLAS routine (true) or the own OpenMP kernel (false).
+
+void benchmark_A(vector<double> const &x, vector<double> const &y, size_t NLOOPS, bool cblas);
+void benchmark_B(vector<double> const &A, vector<double> const &x, size_t NLOOPS, bool cblas);
+void benchmark_C(vector<double> const &A, vector<double> const &B, size_t L, size_t NLOOPS, bool cblas);
+void benchmark_D(vector<double> const &x, vector<double> const &a, size_t NLOOPS);
+void benchmark_summation(vector<double> const &x, size_t NLOOPS);
--- a/ex5/code/timing.h
+++ b/ex5/code/timing.h
@ -0,0 +1,51 @@
+//
+//    Gundolf Haase, Oct 18 2024
+//
+#pragma once
+#include <chrono>                  // timing
+#include <stack>
+
+//using Clock = std::chrono::system_clock;   //!< The wall clock timer chosen
+using Clock = std::chrono::high_resolution_clock;   //!< clock used by tic() and toc()
+using TPoint= std::chrono::time_point<Clock>;       //!< time point of that clock
+
+// [Galowicz, C++17 STL Cookbook, p. 29]
+// Inline variable (C++17): the header may be included in several source files
+// and all of them share this one stack of start times.
+inline 
+std::stack<TPoint> MyStopWatch; //!< starting time of stopwatch
+
+/** Starts a stopwatch timer.
+ *  Use as @code tic(); myfunction(...) ; double tsec = toc();  @endcode
+ *
+ *  Timings can be nested: every call pushes the current time point on top of the stack.
+ *
+ *  @return the time point that has been stored
+ *  @see toc
+ */
+inline auto tic()
+{
+    TPoint const start = Clock::now();
+    MyStopWatch.push(start);
+    return start;
+}
+ 
+/** Returns the elapsed time from stopwatch and removes the top entry of the stack.
+ *
+ * The time point from top of the stack is used
+ * if time point @p t_b is not passed as input parameter.
+ * Use as @code tic(); myfunction(...) ; double tsec = toc();  @endcode
+ * or as @code auto t_b = tic(); myfunction(...) ; double tsec = toc(t_b);  @endcode
+ * The last option is to be used in the case of
+ * non-nested but overlapping time measurements.
+ *
+ * @param[in]  t_b start time of some stop watch
+ * @return elapsed time in seconds.
+ *
+ * @warning Calling toc() without argument requires a preceding tic(); the default
+ *          argument reads the top of the stack, which must not be empty.
+*/
+inline double toc(TPoint const &t_b = MyStopWatch.top())
+{
+    // https://en.cppreference.com/w/cpp/chrono/treat_as_floating_point
+    using Unit      = std::chrono::seconds;
+    using FpSeconds = std::chrono::duration<double, Unit::period>;
+    auto const t_e = Clock::now();
+    // By default t_b refers to the top element of the stack. It has to be evaluated
+    // BEFORE pop() destroys that element, otherwise a dangling reference is read.
+    double const elapsed = FpSeconds(t_e-t_b).count();
+    if (!MyStopWatch.empty()) {
+        MyStopWatch.pop();
+    }
+    return elapsed;
+}