ex5

2025-12-05 23:13:00 +01:00 · 2025-12-05 23:13:00 +01:00 · 95b3017475
commit 95b3017475
parent 0386d99307
28 changed files with 5800 additions and 0 deletions
--- a/ex5/code/Makefile
+++ b/ex5/code/Makefile
@ -0,0 +1,31 @@
+# Builds the exercise binary from all *.cpp files in this directory.
+PROGRAM	= main
+
+SOURCES = $(wildcard *.cpp)
+OBJECTS = ${SOURCES:.cpp=.o}
+
+CXX     = g++
+LINKER  = g++
+
+WARNINGS = -Wall -pedantic -Wextra -Weffc++ -Woverloaded-virtual -Wfloat-equal -Wshadow \
+           -Wredundant-decls -fmax-errors=1
+
+CXXFLAGS   = -g -flto -O3 -ffast-math -march=native -fopenmp ${WARNINGS}
+LINKFLAGS  = -g -flto -O3 -lopenblas -llapacke -fopenmp
+
+# These targets do not produce files of the same name.
+.PHONY: all clean run
+
+all: ${PROGRAM}
+
+# %.o: %.cpp
+# 	${CXX} ${CXXFLAGS} -c $< -o $@
+
+${PROGRAM}:	${OBJECTS}
+	$(LINKER) ${OBJECTS} ${LINKFLAGS} -o ${PROGRAM}
+
+# -f everywhere: cleaning must not fail when out_1.txt was never generated.
+clean:
+	rm -f ${OBJECTS} ${PROGRAM}
+	rm -f out_1.txt
+
+
+run: ${PROGRAM}
+# run: clean ${PROGRAM}
+	./${PROGRAM}
--- a/ex5/code/data_1.txt
+++ b/ex5/code/data_1.txt
@ -0,0 +1,500 @@
+141
+261
+87
+430
+258
+298
+425
+120
+496
+707
+244
+786
+75
+394
+4
+221
+2
+190
+143
+269
+175
+139
+599
+902
+940
+222
+483
+377
+524
+265
+69
+437
+174
+27
+955
+431
+962
+763
+8
+681
+706
+646
+553
+219
+773
+229
+371
+891
+857
+403
+319
+609
+911
+910
+592
+333
+854
+443
+905
+34
+533
+717
+180
+337
+188
+322
+404
+549
+49
+553
+275
+242
+244
+155
+957
+936
+819
+729
+176
+361
+189
+2
+317
+700
+626
+544
+440
+288
+502
+762
+763
+577
+748
+646
+124
+505
+348
+93
+148
+199
+673
+432
+695
+257
+10
+533
+280
+947
+907
+393
+25
+672
+838
+972
+57
+451
+583
+687
+720
+651
+727
+374
+582
+117
+58
+980
+285
+595
+963
+186
+194
+342
+933
+391
+274
+152
+398
+375
+132
+436
+92
+615
+11
+574
+790
+236
+449
+570
+62
+497
+643
+222
+838
+972
+847
+506
+279
+747
+237
+958
+621
+601
+173
+91
+256
+859
+912
+700
+726
+230
+577
+811
+404
+989
+90
+321
+512
+61
+726
+557
+530
+830
+859
+790
+318
+453
+753
+110
+110
+270
+525
+973
+711
+312
+292
+851
+912
+640
+256
+89
+839
+585
+949
+62
+585
+286
+828
+191
+443
+394
+827
+677
+208
+319
+134
+672
+571
+170
+148
+477
+909
+553
+33
+54
+806
+452
+383
+790
+365
+533
+712
+872
+329
+651
+975
+76
+588
+414
+310
+264
+759
+996
+187
+782
+196
+993
+803
+425
+729
+499
+809
+357
+74
+591
+911
+194
+433
+750
+40
+947
+764
+559
+184
+498
+518
+995
+855
+963
+679
+404
+935
+480
+232
+397
+706
+559
+757
+996
+963
+536
+964
+116
+52
+305
+581
+531
+902
+541
+432
+543
+713
+17
+801
+143
+479
+257
+370
+662
+170
+279
+199
+196
+327
+881
+472
+404
+180
+969
+408
+845
+616
+377
+878
+785
+465
+814
+899
+430
+335
+597
+902
+703
+378
+735
+955
+543
+541
+312
+72
+182
+93
+464
+10
+916
+643
+2
+31
+209
+455
+128
+9
+728
+355
+781
+437
+437
+50
+50
+92
+595
+242
+842
+858
+964
+489
+221
+227
+537
+763
+348
+462
+640
+918
+162
+716
+578
+434
+885
+394
+179
+634
+625
+328
+803
+1000
+981
+128
+233
+24
+608
+111
+408
+885
+549
+370
+209
+441
+957
+125
+471
+857
+44
+692
+979
+284
+134
+686
+910
+611
+900
+194
+755
+347
+419
+156
+820
+625
+739
+806
+68
+951
+498
+756
+743
+832
+157
+458
+619
+933
+836
+896
+583
+583
+855
+35
+886
+408
+37
+747
+155
+144
+606
+255
+325
+402
+407
+387
+610
+167
+189
+95
+324
+770
+235
+741
+693
+825
+828
+294
+310
+524
+326
+832
+811
+557
+263
+681
+234
+457
+385
+539
+992
+756
+981
+235
+529
+52
+757
+602
+858
+989
+930
+410
+1
+541
+208
+220
+326
+96
+748
+749
+544
+339
+833
+553
+958
+893
+357
+547
+347
+623
+797
+746
+126
+823
+26
+415
+732
+782
+368
--- a/ex5/code/main.cpp
+++ b/ex5/code/main.cpp
@ -0,0 +1,215 @@
+// clang-tidy *.cpp -checks=llvm-*,-llvm-header-guard -header-filter=.* -enable-check-profile -extra-arg="-std=c++17" -extra-arg="-fopenmp" -- *.cpp
+
+#include "task_2.h"
+#include "task_3.h"
+#include "task_4.h"
+#include "timing.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cmath>
+#include <execution>
+#include <iomanip>
+#include <iostream>
+#include <omp.h>
+#include <sstream>
+#include <vector>
+
+/** Task 2: statistics of the numbers stored in data_1.txt.
+ *
+ *  Reads the data, determines minimum, maximum, arithmetic/geometric/harmonic
+ *  mean and the standard deviation with 4 OpenMP threads, prints the results
+ *  together with the elapsed time and writes the six values to out_1.txt.
+ *  Nothing is computed or written if no data could be read.
+ */
+void task_2() {
+    printf("\n\n-------------- Task 2 --------------\n\n");
+
+    int threads = 4;
+    omp_set_num_threads(threads);
+    cout << omp_get_max_threads() << " threads have been started." << endl;
+
+    // Read vector
+    vector<double> a;
+    read_vector_from_file("data_1.txt", a);
+    if (a.empty()) {
+        // min_element/max_element return end() for an empty range;
+        // dereferencing that iterator would be undefined behaviour.
+        printf("No data available in data_1.txt, task 2 skipped.\n");
+        return;
+    }
+
+    tic();
+    // min and max
+        auto min = *min_element(std::execution::par, a.begin(), a.end());
+        auto max = *max_element(std::execution::par, a.begin(), a.end());
+    // means: x = arithmetic, y = geometric, z = harmonic
+        auto [x,y,z] = means_par(a);
+    // deviation: sqrt of the mean squared distance to the arithmetic mean
+        double deviation(0.0);
+        #pragma omp parallel for shared(x,a) reduction(+:deviation)
+        for (size_t i=0; i<a.size(); i++){
+            // plain multiplication instead of pow(.,2); i < a.size() so no range check needed
+            const double diff = x - a[i];
+            deviation += diff * diff;
+        }
+        deviation = sqrt(deviation/static_cast<double>(a.size()));
+    double t = toc();
+
+    printf("Minimum: %f\n", min);
+    printf("Maximum: %f\n", max);
+    printf("Arithmetic: %f\n", x);
+    printf("Geometric: %f\n", y);
+    printf("Harmonic: %f\n", z);
+    printf("Deviation: %f\n", deviation);
+    printf("Execution time: %f\n", t);
+
+    // write results to file
+    vector<double> b = {min,max,x,y,z,deviation};
+    write_vector_to_file("out_1.txt", b);
+}
+
+/** Task 3: Goldbach decompositions.
+ *
+ *  Counts and prints the decompositions of k = 694 into two primes and then,
+ *  for several upper bounds n, determines the number k <= n with the most
+ *  decompositions together with the time needed by count_goldbach_par().
+ */
+void task_3() {
+    printf("\n\n-------------- Task 3 --------------\n\n");
+
+    int threads = 4;
+    omp_set_num_threads(threads);
+    cout << omp_get_max_threads() << " threads have been started." << endl;
+
+    // #####################################
+    // single_goldbach(k)
+    int k = 694;
+    printf("single_goldbach(k = %d) = %d\n", k, single_goldbach_par(k));
+
+    // Prints decompositions
+    print_decomps(k);
+
+    // count_goldbach(n)
+    // printf("\nNOTE: For n=2'000'000 it will take ~30 seconds.\n");
+    for (int n : {10'000, 100'000, 400'000, 1'000'000, 2'000'000/*, 10'000'000*/}) {
+        tic();
+        vector<int> counts = count_goldbach_par(n);
+        double sec = toc();
+
+        // position of the maximum == number k with the most decompositions
+        auto max = max_element(counts.begin(), counts.end());
+        // %td is the conversion for ptrdiff_t, the type of an iterator difference
+        printf("count_goldbach(n = %d): k = %td, decompositions = %d, time elapsed: %f milliseconds\n", n, max-counts.begin(), *max, sec*1000);
+    }
+    printf("Should be:               k = 9240, 99330, 390390, 990990, 1981980, 9699690\n");
+    printf("            decompositions =  329,  2168,   7094,  15594,   27988,  124180\n\n");
+}
+
+/** Task 4: benchmarks of the OpenMP kernels and speedup tables.
+ *
+ *  Runs the matrix-vector (B), matrix-matrix (C) and polynomial (D) benchmarks
+ *  with 32 threads requested and prints memory, time per loop and throughput.
+ *  Afterwards the summation and the scalar product are timed for 1..T threads
+ *  and several vector lengths; the timings are converted to speedups relative
+ *  to the single-thread run and printed as two tables.
+ */
+void task_4() {
+    printf("\n\n-------------- Task 4 --------------\n\n");
+
+    int threads = 32;
+    omp_set_num_threads(threads);
+    cout << omp_get_max_threads() << " threads have been started." << endl;
+
+    // problem sizes, reused by the benchmark blocks below
+    size_t M, N, L, p, NLOOPS;
+
+    { //      Matrix-Vector product
+    printf("----- Benchmark (B) -----\n");
+    // Initialization
+        M = 8'000;
+        N = 12'000;
+        NLOOPS = 30;
+        auto [A,x] = init_B(M,N);
+    // Benchmark
+        tic();
+        benchmark_B(A, x, NLOOPS, false);
+        double sec = toc() / NLOOPS;    // average time of one product
+    // Timings and Performance
+        size_t memory = M*N + M + N;    // number of elements: matrix + both vectors
+        size_t flops  = 2 * M * N;      // one multiply and one add per matrix entry
+        print_performance(sec, memory, flops, sizeof(A[0]));
+    printf("-------------------------\n");
+    }
+
+    { //     Matrix-Matrix product
+    printf("----- Benchmark (C) -----\n");
+    // Initialization
+        M = 1'000;
+        N = 2'000;
+        L = 500;
+        NLOOPS = 20;
+        auto [A,B] = init_C(M,N,L);
+    // Benchmark
+        tic();
+        benchmark_C(A, B, L, NLOOPS, false);
+        double sec = toc() / NLOOPS;    // average time of one product
+    // Timings and Performance
+        size_t memory = M*L + L*N + M*N;    // number of elements of A, B and C
+        size_t flops  = M * 2*L * N;
+        print_performance(sec, memory, flops, sizeof(A[0]));
+    printf("-------------------------\n");
+    }
+
+    { //      Polynomial evaluation
+    printf("----- Benchmark (D) -----\n");
+    // Initialization
+        N = 1'000'000;
+        p = 200;
+        NLOOPS = 20;
+        auto [x,a] = init_D(N,p);
+    // Benchmark
+        tic();
+        benchmark_D(x, a, NLOOPS);
+        double sec = toc() / NLOOPS;    // average time of one evaluation sweep
+    // Timings  and Performance
+        size_t memory = 2.0 * N;
+        size_t flops  = 2.0 * N * p;
+        print_performance(sec, memory, flops, sizeof(x[0]));
+    printf("-------------------------\n");
+    }
+
+
+    // Timing
+    // Results are stored row-wise: row = size index k, column = thread count t+1.
+    // NOTE(review): the tables below label the rows k = 3..K, but the vector
+    // length used here is pow(10,k) with k = 0..K-3 - confirm which sizes are intended.
+    NLOOPS = 50;
+    int K=9, T=16;
+    vector<double> speedup_sum((K-3+1)*T), speedup_scalar((K-3+1)*T);
+    for (int k=0; k<(K-3+1); ++k) {
+        N = pow(10,k);
+        auto [x,y] = init_A(N);
+        for (int t=0; t<T; t++) {
+            omp_set_num_threads(t+1);
+
+            tic();
+            benchmark_summation(x, NLOOPS);
+            speedup_sum[k*T+t] = toc() / NLOOPS;
+            
+            tic();
+            benchmark_A(x, y, NLOOPS, false);
+            speedup_scalar[k*T+t] = toc() / NLOOPS;
+        }
+    }
+
+    // Calculating speedup
+    // Each timing is replaced in place by (single-thread time) / (time with t+1 threads).
+    for (int k=0; k<(K-3+1); ++k) {
+        double t0 = speedup_sum[k*T];
+        double t00 = speedup_scalar[k*T];
+        for (int t=0; t<T; t++){
+            speedup_sum[k*T+t] = t0/speedup_sum[k*T+t];
+            speedup_scalar[k*T+t] = t00/speedup_scalar[k*T+t];
+        }
+    }
+
+    // Printing tables
+    cout << fixed << setprecision(4);
+    cout << "\n\nSpeedup: summation" << endl;
+    cout << "k \\ threads |  ";
+    for (int t=0; t<T; t++) {cout << setw(2) << t+1 << "  |  ";}
+    cout << endl;
+    for (int k=3; k<K+1; ++k) {
+        cout << "     " << k << "      |";
+        for (int t=0; t<T; t++) {
+            cout << speedup_sum[(k-3)*T+t] << "|";
+        }
+    cout << endl;
+    }
+
+    cout << "\n\nSpeedup: scalar" << endl;
+    cout << "k \\ threads |  ";
+    for (int t=0; t<T; t++) {cout << setw(2) << t+1 << "  |  ";}
+    cout << endl;
+    for (int k=3; k<K+1; ++k) {
+        cout << "     " << k << "      |";
+        for (int t=0; t<T; t++) {
+            cout << speedup_scalar[(k-3)*T+t] << "|";
+        }
+    cout << endl;
+    }
+}
+
+/// Program entry point: executes the three exercise tasks one after another.
+int main() {
+    task_2();   // statistics of data_1.txt
+    task_3();   // Goldbach decompositions
+    task_4();   // kernel benchmarks and speedup tables
+    return 0;
+}
--- a/ex5/code/mayer_primes.h
+++ b/ex5/code/mayer_primes.h
@ -0,0 +1,73 @@
+#pragma once
+
+#include <cstring> //memset
+#include <vector>
+//using namespace std;
+
+/** \brief Determines all prime numbers in interval [2, @p max].
+ *
+ *  The sieve of Eratosthenes is used.
+ *
+ *  The implementation originates from <a href="http://code.activestate.com/recipes/576559-fast-prime-generator/">Florian Mayer</a>.
+ *
+ * \param[in]   max end of interval for the prime number search.
+ * \return  vector of prime numbers @f$2,3,5, ..., p<=max @f$.
+ *
+ * \copyright
+ * Copyright (c) 2008 Florian Mayer          (adapted by Gundolf Haase 2018)
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+template <class T>
+std::vector<T> get_primes(T max)
+{
+    std::vector<T> primes;
+    if (max < 2) {
+        return primes;              // there is no prime below 2
+    }
+    // Bit sieve, one bit per number, all bits initially set ("is prime").
+    // std::vector owns the storage, so it is released even if push_back throws.
+    std::vector<unsigned char> sieve(max / 8 + 1, 0xFF);
+    for (T x = 2; x <= max; x++)
+    {
+        if (sieve[x / 8] & (0x01 << (x % 8))) {
+            primes.push_back(x);
+            // x is prime: clear the bits of its multiples 2x, 3x, ...
+            // The loop is left before j += x could exceed max, hence j never overflows T.
+            if (x <= max / 2) {
+                for (T j = x + x; ; j += x)
+                {
+                    sieve[j / 8] &= static_cast<unsigned char>(~(0x01 << (j % 8)));
+                    if (j > max - x) {
+                        break;
+                    }
+                }
+            }
+        }
+    }
+    return primes;
+}
+
+//---------------------------------------------------------------
+//int main()        // by  Florian Mayer
+//{g++ -O3  -std=c++14 -fopenmp main.cpp && ./a.out
+//    vector<unsigned long> primes;
+//    primes = get_primes(10000000);
+//    // return 0;
+//    // Print out result.
+//    vector<unsigned long>::iterator it;
+//    for(it=primes.begin(); it < primes.end(); it++)
+//        cout << *it << " ";
+//
+//    cout << endl;
+//    return 0;
+//}
--- a/ex5/code/out_1.txt
+++ b/ex5/code/out_1.txt
@ -0,0 +1,6 @@
+1
+1000
+498.184
+364.412
+95.6857
+287.905
--- a/ex5/code/task_2.cpp
+++ b/ex5/code/task_2.cpp
@ -0,0 +1,93 @@
+#include "task_2.h"
+#include <cassert>          // assert
+#include <cmath>
+#include <fstream>
+#include <omp.h>
+
+/** Determines the smallest and the largest entry of @p v with an OpenMP reduction.
+ *
+ *  @param[in] v  non-empty vector of values
+ *  @return tuple (minimum, maximum)
+ */
+tuple<double, double> min_max_par(const vector<double> &v) {
+    assert(!v.empty() && "min_max_par needs a non-empty vector");
+    // The accumulators have to be double; int accumulators truncate the entries.
+    double min_val = v[0];
+    double max_val = v[0];
+
+    #pragma omp parallel for reduction(min:min_val) reduction(max:max_val)
+    for (size_t i = 0; i < v.size(); ++i) {
+        if (v[i] < min_val) min_val = v[i];
+        if (v[i] > max_val) max_val = v[i];
+    }
+
+    return make_tuple(min_val,max_val);
+}
+
+
+/** Computes arithmetic, geometric and harmonic mean of @p v in one parallel sweep.
+ *
+ *  @param[in] v  vector of values
+ *  @return tuple (arithmetic mean, geometric mean, harmonic mean)
+ */
+tuple<double, double, double> means_par(const vector<double>& v){
+    size_t count = v.size();
+    double total = 0;               // sum of the entries
+    double log_total = 0;           // sum of their logarithms
+    double reciprocal_total = 0;    // sum of their reciprocals
+
+    #pragma omp parallel for shared(v,count) reduction(+:total, log_total, reciprocal_total)
+    for (size_t idx = 0; idx < count; ++idx) {
+        total += v[idx];
+        log_total += log(v[idx]);
+        reciprocal_total += 1.0/v[idx];
+    }
+
+    return make_tuple(total / static_cast<double>(count),
+                      exp(1.0/static_cast<double>(count) * log_total),
+                      static_cast<double>(count) / reciprocal_total);
+}
+
+// Appends all doubles that can be extracted from istr to v.
+void fill_vector(istream& istr, vector<double>& v)
+{
+    // read as long as the extraction succeeds
+    for (double value = 0; istr >> value; ) {
+        v.push_back(value);
+    }
+
+    const bool stopped_before_end = !istr.eof();
+    if (stopped_before_end)
+    { // error handling
+        cout << " Error handling \n";
+        if ( istr.bad() ) {
+            throw runtime_error("Schwerer Fehler in istr");
+        }
+        if ( istr.fail() ) {   // try to clean up the stream state
+            cout << " Failed in reading all data.\n";
+            istr.clear();
+        }
+    }
+    v.shrink_to_fit();                 // C++11
+}
+
+
+// Replaces the content of v by the numbers stored in the text file file_name.
+void read_vector_from_file(const string& file_name, vector<double>& v)
+{
+    ifstream fin(file_name);           // open the file in ASCII mode
+    if( !fin.is_open() )               // file not found:
+    {
+        cout << "\nFile " << file_name << " has not been found.\n\n" ;
+        assert( fin.is_open() && "File not found." );       // exception handling for the poor programmer
+        return;
+    }
+
+    // file found: drop the old content, then read
+    v.clear();
+    fill_vector(fin, v);
+}
+
+// Writes the entries of v, one per line, into the text file file_name.
+void write_vector_to_file(const string& file_name, const vector<double>& v)
+{
+    ofstream fout(file_name);          // open the file in ASCII mode
+    if( !fout.is_open() )
+    {
+        cout << "\nFile " << file_name << " has not been opened.\n\n" ;
+        assert( fout.is_open() && "File not opened."  );         // exception handling for the poor programmer
+        return;
+    }
+
+    for (const double value : v)
+    {
+        fout << value << endl;
+    }
+}
--- a/ex5/code/task_2.h
+++ b/ex5/code/task_2.h
@ -0,0 +1,38 @@
+#pragma once
+#include <iostream>
+#include <vector>
+using namespace std;
+
+tuple<double, double> min_max_par(const vector<double> &v);
+tuple<double, double, double> means_par(const vector<double>& v);
+
+/**
+  This function opens the ASCII-file named @p file_name and reads the
+  double data into the C++ vector @p v.
+  If the file  @p file_name does not exist then the code stops with an appropriate message.
+  @param[in]    file_name   name of the ASCII-file
+  @param[out]   v           C++ vector with double values
+*/
+
+void read_vector_from_file(const string& file_name, vector<double>& v);
+
+
+/**
+  This function opens the ASCII-file named @p file_name and overwrites it with the
+  double data from the C++ vector @p v.
+  If there are problems in opening/generating file  @p file_name
+  then the code stops with an appropriate message.
+  @param[in]    file_name   name of the ASCII-file
+  @param[in]    v           C++ vector with double values
+*/
+
+void write_vector_to_file(const string& file_name, const vector<double>& v);
+
+/**
+  Fills the double-vector @p v with data from an input stream @p istr until this input stream
+  ends regularly. The data are appended to @p v (the vector is not cleared beforehand) and its memory is automatically allocated.
+  @param[in]    istr        input stream
+  @param[out]    v           C++ vector with double values
+  @warning     An exception is thrown in case of wrong data format or corrupted data.
+*/
+void fill_vector(istream& istr, vector<double>& v);
--- a/ex5/code/task_3.cpp
+++ b/ex5/code/task_3.cpp
@ -0,0 +1,73 @@
+#include "task_3.h"
+#include "mayer_primes.h"
+#include "timing.h"
+
+#include <algorithm>
+#include <cassert>
+#include <iostream>
+#include <omp.h>
+#include <vector>
+using namespace std;
+
+
+// Counts the pairs of primes (p, q) with p <= q and p + q == k; the outer loop runs in parallel.
+int single_goldbach_par(int k) {
+    const vector<int> primes = get_primes(k);
+    const size_t n_primes = primes.size();
+    int hits = 0;
+
+    #pragma omp parallel for reduction(+:hits)
+    for (size_t lo = 0; lo < n_primes; ++lo) {
+        // second index starts at lo so that every pair is visited only once
+        for (size_t hi = lo; hi < n_primes; ++hi) {
+            const bool matches = (primes[lo] + primes[hi] == k);
+            if (matches) {
+                ++hits;
+            }
+        }
+    }
+
+    return hits;
+}
+
+
+/** Counts, for every number up to @p n, the decompositions into two odd primes.
+ *
+ *  Every thread collects its share in a local vector; the per-thread results
+ *  are combined by the user-defined reduction VecAdd.
+ *
+ *  @param[in] n  largest number considered
+ *  @return vector counts with counts[s] = number of pairs p <= q of primes
+ *          (the prime 2 is skipped) with p + q == s, for s = 0,...,n
+ */
+vector<int> count_goldbach_par(int n) {
+    const vector<int> primes = get_primes(n);
+    vector<int> counts(n+1);
+
+    #pragma omp parallel reduction(VecAdd:counts)
+    {
+        vector<int> local_counts(n+1, 0);
+
+        // i starts at 1: primes[0] == 2 is left out
+        #pragma omp for
+        for (size_t i = 1; i < primes.size(); i++) {
+            for (size_t j = i; j < primes.size(); j++) {
+                const int sum = primes[i] + primes[j];
+                if (sum > n) {
+                    // get_primes() returns ascending primes, so all further sums are larger too
+                    break;
+                }
+                local_counts[sum]++;
+            }
+        }
+
+        // add into this thread's private copy of counts; VecAdd merges the copies
+        counts += local_counts;
+    }
+    return counts;
+}
+
+
+// Prints every pair of primes p <= q with p + q == k to stdout.
+void print_decomps(int k) {
+    const vector<int> primes = get_primes(k);
+    const size_t n_primes = primes.size();
+    cout << "\nDecompositions for k = " << k << ": ";
+
+    for (size_t lo = 0; lo < n_primes; ++lo) {
+        const int first = primes[lo];
+        for (size_t hi = lo; hi < n_primes; ++hi) {
+            const int second = primes[hi];
+            if (first + second != k) {
+                continue;
+            }
+            cout << first << " + " << second << ", ";
+        }
+    }
+    cout << endl;
+}
--- a/ex5/code/task_3.h
+++ b/ex5/code/task_3.h
@ -0,0 +1,33 @@
+#pragma once
+#include <cassert>
+#include <vector>
+using namespace std;
+
+
+// Counts number of possible decompositions with 2 primes that sum up to k.
+int single_goldbach_par(int k);
+
+// Counts number of possible decompositions with 2 primes that sum up to k for all even numbers k \in {4,...,n}.
+vector<int> count_goldbach_par(int n);
+
+// Prints all decompositions of k.
+void print_decomps(int k);
+
+
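+/** 	 Adds the elements of vector @p b componentwise to vector @p a .
+	@param[in,out] a	vector that is updated, must have the same size as @p b
+	@param[in]     b	vector that is added
+	@return 	     reference to @p a after a+=b componentwise
+*/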
+template<class T>
+std::vector<T> &operator+=(std::vector<T> &a, std::vector<T> const &b)
+{
+    assert(a.size()==b.size());
+    // walk both vectors in lock step and accumulate into a
+    auto src = b.begin();
+    for (auto dst = a.begin(); dst != a.end(); ++dst, ++src) {
+        *dst += *src;
+    }
+    return a;
+}
+
+#pragma omp declare reduction(VecAdd : std::vector<int>  : omp_out += omp_in) \
+ initializer (omp_priv=omp_orig)
--- a/ex5/code/task_4.cpp
+++ b/ex5/code/task_4.cpp
@ -0,0 +1,232 @@
+#include "task_4.h"
+#include "timing.h"
+#include <cassert>
+#include <cblas.h>               // cBLAS Library
+#include <iostream>
+#include <vector>
+using namespace std;
+
+// Matrix-vector product b = A*x for a row-wise stored matrix A with x.size() columns.
+vector<double> matrix_vec(vector<double> const &A, vector<double> const &x) {
+    size_t const cols = x.size();
+    size_t const rows = A.size() / cols;
+    vector<double> result(rows);        // starts with zeros
+
+    #pragma omp parallel for shared(A,x,cols,rows,result)
+    for (size_t r = 0; r < rows; ++r) {
+        size_t const row_begin = r*cols;
+        for (size_t c = 0; c < cols; ++c) {
+            result[r] += A[row_begin + c] * x[c];
+        }
+    }
+    return result;
+}
+
+
+// Matrix-matrix product C = A*B for row-wise stored matrices; A has M rows.
+vector<double> matrix_matrix(vector<double> const &A, vector<double> const &B, size_t const &M) {
+    size_t const inner = A.size() / M;          // columns of A == rows of B
+    size_t const cols  = B.size() / inner;      // columns of B and of the result
+    vector<double> product(M*cols,0);
+
+    #pragma omp parallel for shared(A,B,M,inner,cols,product)
+    for (size_t row = 0; row < M; ++row) {
+        for (size_t mid = 0; mid < inner; ++mid) {
+            // innermost loop runs along rows of B and of the result
+            for (size_t col = 0; col < cols; ++col) {
+                product[row*cols + col] += A[row*inner + mid] * B[mid*cols + col];
+            }
+        }
+    }
+    return product;
+}
+
+
+/** Evaluates the polynomial a[0] + a[1]*x + ... + a[p-1]*x^(p-1) at all points of @p x.
+ *
+ *  Horner's scheme is used for every point; the points are distributed over
+ *  the OpenMP threads.
+ *
+ *  @param[in] x  evaluation points
+ *  @param[in] a  coefficients, a[k] belongs to x^k
+ *  @return vector y with y[i] = polynomial value at x[i]; all zeros if @p a is empty
+ */
+vector<double> poly(vector<double> const &x, vector<double> const &a) {
+    size_t N = x.size();
+    size_t p = a.size();
+    vector<double> y(N);
+    if (p == 0) {
+        return y;                       // empty sum: the zero polynomial
+    }
+
+    #pragma omp parallel for shared(x,a,N,p,y)
+    for (size_t i = 0; i < N; ++i) {
+        // start with the leading coefficient a[p-1] (a[p] is out of range) ...
+        double value = a[p-1];
+        // ... and work down to a[0] inclusively
+        for (size_t k = 2; k <= p; ++k) {
+            value = value*x[i] + a[p-k];
+        }
+        y[i] = value;
+    }
+    return y;
+}
+
+// Inner product of x and y (equal length), summed up with an OpenMP reduction.
+double scalar(vector<double> const &x, vector<double> const &y) {
+    assert(x.size() == y.size());
+    size_t const length = x.size();
+    double dot = 0.0;
+
+    #pragma omp parallel for shared(x,y,length) reduction(+:dot)
+    for (size_t idx = 0; idx < length; ++idx) {
+        dot += x[idx] * y[idx];
+    }
+    return dot;
+}
+
+// Sum of all entries of x, computed with an OpenMP reduction.
+double summation(vector<double> const &x){
+    size_t length = x.size();
+    double total = 0.0;
+
+    #pragma omp parallel for shared(x,length) reduction(+:total)
+    for (size_t idx = 0; idx < length; ++idx) {
+        total += x[idx];
+    }
+
+    return total;
+}
+
+
+
+// ##########################################################################
+
+
+// Prints memory footprint, time per loop and throughput of a benchmark.
+//   sec    : duration of one loop in seconds
+//   memory : number of data elements touched
+//   flops  : number of floating point operations per loop
+//   size   : size of one data element in bytes
+// All scaled quantities are divided by 1024 three times.
+void print_performance(double sec, size_t memory, size_t flops, unsigned int size) {
+    double const mem_scaled   = 1.0 * memory / 1024 / 1024 / 1024 * size;
+    double const flops_scaled = 1.0 * flops / sec / 1024 / 1024 / 1024;
+    double const bandwidth    = 1.0 * memory / sec / 1024 / 1024 / 1024 * size;
+
+    printf("Memory allocated  : %.3f GByte\n", mem_scaled);
+    printf("Duration per loop : %.3f sec\n", sec);
+    printf("GFLOPS            : %.3f\n", flops_scaled);
+    printf("GiByte/s          : %.3f\n", bandwidth);
+}
+
+// Test data for the scalar product: x[i] = i%219 + 1 and y[i] = 1/x[i].
+tuple<vector<double>, vector<double>> init_A(size_t N) {
+    vector<double> x(N);
+    vector<double> y(N);
+    size_t idx = 0;
+    while (idx < N) {
+        double const value = idx%219 + 1.0;
+        x[idx] = value;
+        y[idx] = 1.0 / value;
+        ++idx;
+    }
+    return make_tuple(x, y);
+}
+
+// Runs the scalar product NLOOPS times, either with scalar() or with cblas_ddot(),
+// and prints a warning if the accumulated result differs from N*NLOOPS.
+void benchmark_A(vector<double> const &x, vector<double> const &y, size_t NLOOPS, bool cblas) {
+    size_t N = x.size();
+    double accumulated(0.0);
+
+    if (cblas) {
+        for (size_t loop = 0; loop < NLOOPS; ++loop) {
+            accumulated += cblas_ddot(N, x.data(), 1, y.data(), 1);
+        }
+    } else {
+        for (size_t loop = 0; loop < NLOOPS; ++loop) {
+            accumulated += scalar(x, y);
+        }
+    }
+
+    // Check correctness
+    bool const as_expected = (static_cast<size_t>(accumulated) == N*NLOOPS);
+    if (!as_expected) {printf("  !!   W R O N G  result   !!\n");}
+}
+
+// Test data for the matrix-vector product: A[i,j] = (i+j)%219 + 1 (row-wise, M x N)
+// and x[j] = 1/A[17,j].
+tuple<vector<double>, vector<double>> init_B(size_t M, size_t N) {
+    vector<double> A(M*N);
+    vector<double> x(N);
+    for (size_t row = 0; row < M; ++row) {
+        size_t const offset = row*N;
+        for (size_t col = 0; col < N; ++col) {
+            A[offset + col] = (row+col)%219 + 1.0;
+        }
+    }
+    // reciprocals of matrix row 17
+    size_t const row17 = 17*N;
+    for (size_t col = 0; col < N; ++col) {
+        x[col] = 1.0/A[row17 + col];
+    }
+    return make_tuple(A, x);
+}
+
+// Runs the matrix-vector product NLOOPS times, either with matrix_vec() or with
+// cblas_dgemv(), and prints a warning if the accumulated entry 17 differs from N*NLOOPS.
+void benchmark_B(vector<double> const &A, vector<double> const &x, size_t NLOOPS, bool cblas) {
+    size_t N = x.size();
+    size_t M = A.size() / N;
+    vector<double> b(M);
+    double accumulated(0.0);
+
+    if (cblas) {
+        for (size_t loop = 0; loop < NLOOPS; ++loop) {
+            cblas_dgemv(CblasRowMajor, CblasNoTrans, M, N, 1.0, A.data(), N, x.data(), 1, 0, b.data(), 1);
+            accumulated += b[17];
+        }
+    } else {
+        for (size_t loop = 0; loop < NLOOPS; ++loop) {
+            b = matrix_vec(A,x);
+            accumulated += b[17];
+        }
+    }
+
+    // Check correctness
+    bool const as_expected = (static_cast<size_t>(accumulated) == N*NLOOPS);
+    if (!as_expected) {printf("  !!   W R O N G  result   !!\n");}
+}
+
+// Test data for the matrix-matrix product: A (M x L) with A[i,j] = (i+j)%219 + 1,
+// B (L x N) with the same pattern except for column 17.
+tuple<vector<double>, vector<double>> init_C(size_t M, size_t N, size_t L) {
+    vector<double> A(M*L);
+    vector<double> B(L*N);
+    for (size_t row = 0; row < M; ++row) {
+        for (size_t col = 0; col < L; ++col) {
+            A[row*L + col] = (row+col)%219 + 1.0;
+        }
+    }
+    // B chosen such that C[0,17]=L
+    // so B[i,17] = 1/A[0,i]
+    for (size_t row = 0; row < L; ++row) {
+        for (size_t col = 0; col < N; ++col) {
+            B[row*N + col] = (col == 17) ? 1.0/A[row]
+                                         : (row+col)%219 + 1.0;
+        }
+    }
+    return make_tuple(A, B);
+}
+
+// Runs the matrix-matrix product NLOOPS times, either with matrix_matrix() or with
+// cblas_dgemm(), and prints a warning if the accumulated entry 17 differs from L*NLOOPS.
+void benchmark_C(vector<double> const &A, vector<double> const &B, size_t L, size_t NLOOPS, bool cblas) {
+    size_t M = A.size() / L;
+    size_t N = B.size() / L;
+    vector<double> C(M*N);
+    double accumulated(0.0);
+
+    if (cblas) {
+        for (size_t loop = 0; loop < NLOOPS; ++loop) {
+            cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, M, N, L, 1.0, A.data(), L, B.data(), N, 0.0, C.data(), N);
+            accumulated += C[17];
+        }
+    } else {
+        for (size_t loop = 0; loop < NLOOPS; ++loop) {
+            C = matrix_matrix(A,B,M);
+            accumulated += C[17];
+        }
+    }
+
+    // Check correctness
+    bool const as_expected = (static_cast<size_t>(accumulated) == L*NLOOPS);
+    if (!as_expected) {printf("  !!   W R O N G  result   !!\n");}
+}
+
+// Test data for the polynomial evaluation:
+//   x_i = i/N for i=0,...,N-1   (evaluation points)
+//   a_j = 1   for j=0,...,p-1   (coefficients)
+tuple<vector<double>, vector<double>> init_D(size_t N, size_t p) {
+    vector<double> x(N);
+    size_t idx = 0;
+    for (double &point : x) {
+        point = static_cast<double>(idx) / N;
+        ++idx;
+    }
+    vector<double> a(p, 1.0);
+    return make_tuple(x, a);
+}
+
+// Runs the polynomial evaluation poly() NLOOPS times and prints a warning
+// if the accumulated first result entry differs from NLOOPS.
+void benchmark_D(vector<double> const &x, vector<double> const &a, size_t NLOOPS) {
+    size_t N = x.size();
+    vector<double> y(N);
+    double sum(0.0);
+
+    size_t remaining = NLOOPS;
+    while (remaining > 0) {
+        y = poly(x,a);
+        sum += y[0];
+        --remaining;
+    }
+
+    // Check correctness
+    bool const as_expected = (static_cast<size_t>(sum) == NLOOPS);
+    if (!as_expected) {printf("  !!   W R O N G  result sum = %f  !!\n", sum);}
+}
+
+// Calls summation() NLOOPS times; the accumulated value is not used any further.
+void benchmark_summation(vector<double> const &x, size_t NLOOPS) {
+    double accumulated(0.0);
+    size_t remaining = NLOOPS;
+    while (remaining > 0) {
+        accumulated += summation(x);
+        --remaining;
+    }
+}
--- a/ex5/code/task_4.h
+++ b/ex5/code/task_4.h
@ -0,0 +1,22 @@
+#pragma once
+#include <vector>
+using namespace std;
+
+vector<double> matrix_vec(vector<double> const &A, vector<double> const &x);
+vector<double> matrix_matrix(vector<double> const &A, vector<double> const &B, size_t const &M);
+vector<double> poly(vector<double> const &x, vector<double> const &a);
+double scalar(vector<double> const &x, vector<double> const &y);
+double summation(vector<double> const &x);
+
+void print_performance(double sec, size_t memory, size_t flops, unsigned int size);
+tuple<vector<double>, vector<double>> init_A(size_t N);
+tuple<vector<double>, vector<double>> init_B(size_t M, size_t N);
+tuple<vector<double>, vector<double>> init_C(size_t M, size_t N, size_t L);
+tuple<vector<double>, vector<double>> init_D(size_t N, size_t p);
+
+
+void benchmark_A(vector<double> const &x, vector<double> const &y, size_t NLOOPS, bool cblas);
+void benchmark_B(vector<double> const &A, vector<double> const &x, size_t NLOOPS, bool cblas);
+void benchmark_C(vector<double> const &A, vector<double> const &B, size_t L, size_t NLOOPS, bool cblas);
+void benchmark_D(vector<double> const &x, vector<double> const &a, size_t NLOOPS);
+void benchmark_summation(vector<double> const &x, size_t NLOOPS);
--- a/ex5/code/timing.h
+++ b/ex5/code/timing.h
@ -0,0 +1,51 @@
+//
+//    Gundolf Haase, Oct 18 2024
+//
+#pragma once
+#include <chrono>                  // timing
+#include <stack>
+
+//using Clock = std::chrono::system_clock;   //!< The wall clock timer chosen
+using Clock = std::chrono::high_resolution_clock;
+using TPoint= std::chrono::time_point<Clock>;
+
+// [Galowicz, C++17 STL Cookbook, p. 29]
+inline 
+std::stack<TPoint> MyStopWatch; //!< starting time of stopwatch
+
+/** Starts stopwatch timer.
+ *  Use as @code tic(); myfunction(...) ; double tsec = toc();  @endcode
+ * 
+ *  The timing can be nested and the recent time point is stored on top of the stack.
+ * 
+ *  @return recent time point
+ *  @see toc
+ */
+inline auto tic()
+{
+    TPoint const started = Clock::now();
+    MyStopWatch.push(started);      // remembered for the matching toc()
+    return started;
+}
+ 
+/** Returns the elapsed time from stopwatch. 
+ * 
+ * The time point from top of the stack is used
+ * if time point @p t_b is not passed as input parameter.
+ * Use as @code tic(); myfunction(...) ; double tsec = toc();  @endcode
+ * or as @code auto t_b = tic(); myfunction(...) ; double tsec = toc(t_b);  @endcode
+ * The last option is to be used in the case of 
+ * non-nested but overlapping time measurements.
+ * 
+ * @param[in]  t_b start time of some stop watch
+ * @return elapsed time in seconds.
+ *
+*/ 
+inline double toc(TPoint const &t_b = MyStopWatch.top())
+{
+    // https://en.cppreference.com/w/cpp/chrono/treat_as_floating_point
+    using Unit      = std::chrono::seconds;
+    using FpSeconds = std::chrono::duration<double, Unit::period>;
+    auto const t_e = Clock::now();
+    // Compute the elapsed time BEFORE popping: with the default argument t_b
+    // refers to the top element of the stack, which pop() destroys.
+    double const elapsed = FpSeconds(t_e-t_b).count();
+    if (!MyStopWatch.empty()) {         // pop() on an empty stack is undefined
+        MyStopWatch.pop();
+    }
+    return elapsed;
+}
--- a/ex5/demo_skalar/CLANG_default.mk
+++ b/ex5/demo_skalar/CLANG_default.mk
@ -0,0 +1,131 @@
+# Basic definitions for using the Clang/LLVM compiler suite
+# requires setting of COMPILER=CLANG_
+# https://llvm.org/docs/CompileCudaWithLLVM.html
+# https://llvm.org/docs/NVPTXUsage.html
+
+#CLANGPATH=//usr/lib/llvm-10/bin/
+CC     = ${CLANGPATH}clang
+CXX    = ${CLANGPATH}clang++
+#CXX   = ${CLANGPATH}clang++ -lomptarget  -fopenmp-targets=nvptx64-nvidia-cuda --cuda-path=/opt/pgi/linux86-64/2017/cuda/8.0
+#F77   = gfortran
+LINKER = ${CXX}
+
+#http://clang.llvm.org/docs/UsersManual.html#options-to-control-error-and-warning-messages
+WARNINGS += -pedantic -Weverything -Wdocumentation -Wconversion -Wshadow -Wfloat-conversion
+WARNINGS += -Wno-c++98-compat -Wno-sign-conversion -Wno-date-time -Wno-shorten-64-to-32 -Wno-padded -ferror-limit=1
+WARNINGS += -Wno-unsafe-buffer-usage
+#-fsyntax-only -Wdocumentation -Wconversion -Wshadow -Wfloat-conversion -pedantic
+
+CXXFLAGS += -O3 -std=c++17 -ferror-limit=1 ${WARNINGS}
+# don't use -Ofast
+# -ftrapv
+LINKFLAGS += -O3
+
+# different libraries in Ubuntu or Manjaro
+ifndef UBUNTU
+UBUNTU=1
+endif
+
+# BLAS, LAPACK
+LINKFLAGS += -llapack -lblas
+# -lopenblas
+ifeq ($(UBUNTU),1)
+# ubuntu
+else
+# on  archlinux
+LINKFLAGS += -lcblas
+endif
+
+# interprocedural optimization
+CXXFLAGS  += -flto
+LINKFLAGS += -flto
+
+#sudo apt install libomp-dev
+# OpenMP
+CXXFLAGS += -fopenmp
+LINKFLAGS += -fopenmp
+
+#   very good check
+# http://clang.llvm.org/extra/clang-tidy/
+#   good check, see:  http://llvm.org/docs/CodingStandards.html#include-style
+# Checks that are switched off when the readability group is enabled (see READABILITY below).
+# Kept as ONE assignment: appending with '+=' inserts a blank, which would split the
+# comma-separated list of the -checks= option into two command line arguments.
+SWITCH_OFF=,-readability-magic-numbers,-readability-redundant-control-flow,-readability-redundant-member-init,-readability-isolate-declaration
+#READABILITY=,readability*${SWITCH_OFF}
+#TIDYFLAGS = -checks=llvm-*,-llvm-header-guard -header-filter=.* -enable-check-profile -extra-arg="-std=c++17" -extra-arg="-fopenmp"
+TIDYFLAGS = -checks=llvm-*,-llvm-header-guard${READABILITY} -header-filter=.* -enable-check-profile -extra-arg="-std=c++17" -extra-arg="-fopenmp"
+#TIDYFLAGS += -checks='modernize*
+#   ???
+#TIDYFLAGS = -checks='cert*'  -header-filter=.*
+#   MPI checks ??
+#TIDYFLAGS = -checks='mpi*'
+#   ??
+#TIDYFLAGS = -checks='performance*'   -header-filter=.*
+#TIDYFLAGS = -checks='portability-*'  -header-filter=.*
+#TIDYFLAGS = -checks='readability-*'  -header-filter=.*
+
+default: ${PROGRAM}
+
+${PROGRAM}:	${OBJECTS}
+	$(LINKER)  $^  ${LINKFLAGS} -o $@
+
+clean:
+	@rm -f ${PROGRAM} ${OBJECTS}
+
+clean_all:: clean
+	@rm -f *_ *~ *.bak *.log *.out *.tar
+
+codecheck: tidy_check
+tidy_check:
+	clang-tidy ${SOURCES} ${TIDYFLAGS} -- ${SOURCES}
+# see also http://clang-developers.42468.n3.nabble.com/Error-while-trying-to-load-a-compilation-database-td4049722.html
+
+run: clean ${PROGRAM}
+#	time  ./${PROGRAM} ${PARAMS}
+	./${PROGRAM} ${PARAMS}
+
+# tar the current directory
+MY_DIR = `basename ${PWD}`
+tar: clean_all
+	@echo "Tar the directory: " ${MY_DIR}
+	@cd .. ;\
+	tar cf ${MY_DIR}.tar ${MY_DIR} *default.mk ;\
+	cd ${MY_DIR}
+# 	tar cf `basename ${PWD}`.tar *
+
+doc:
+	doxygen Doxyfile
+
+#########################################################################
+
+.cpp.o:
+	$(CXX) -c $(CXXFLAGS) -o $@ $<
+
+.c.o:
+	$(CC) -c $(CFLAGS) -o $@ $<
+
+.f.o:
+	$(F77) -c $(FFLAGS) -o $@ $<
+
+##################################################################################################
+#    some tools
+# Cache behaviour (CXXFLAGS += -g  tracks down to source lines; no -pg in linkflags)
+cache: ${PROGRAM}
+	valgrind --tool=callgrind --simulate-cache=yes ./$^  ${PARAMS}
+#	kcachegrind callgrind.out.<pid> &
+	kcachegrind `ls -1tr  callgrind.out.* |tail -1`
+
+# Check for wrong memory accesses, memory leaks, ...
+# use smaller data sets
+mem: ${PROGRAM}
+	valgrind -v --leak-check=yes --tool=memcheck --undef-value-errors=yes --track-origins=yes --log-file=$^.addr.out --show-reachable=yes ./$^  ${PARAMS}
+
+#  Simple run time profiling of your code
+#  CXXFLAGS += -g -pg
+#  LINKFLAGS += -pg
+prof: ${PROGRAM}
+	perf record ./$^  ${PARAMS}
+	perf report
+#	gprof -b ./$^ > gp.out
+#	kprof -f gp.out -p gprof &
+
+codecheck: tidy_check
--- a/ex5/demo_skalar/GCCMKL_default.mk
+++ b/ex5/demo_skalar/GCCMKL_default.mk
@ -0,0 +1,212 @@
+# Basic definitions for using the GNU compiler suite with OpenMP and MKL
+# requires setting of COMPILER=GCCMKL_
+
+# install MKL in manjaro
+# https://linux-packages.com/manjaro-linux/package/intel-mkl
+# >   sudo pacman -Sy 
+# >   sudo pacman -S intel-mkl 
+
+ifeq ($(ONEAPI),1)
+MKL_INCLUDE=/opt/intel/oneapi/mkl/2024.0/include
+MKL_LIB=/opt/intel/oneapi/2024.0/lib
+export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/opt/intel/oneapi/2024.0/lib
+else
+MKL_INCLUDE=/usr/include/mkl
+MKL_LIB=/usr/lib/x86_64-linux-gnu/mkl
+endif
+
+CC	= gcc
+CXX     = g++
+F77	= gfortran
+LINKER  = ${CXX}
+
+WARNINGS = -Wall -pedantic -Wextra -Weffc++ -Woverloaded-virtual -Wfloat-equal -Wshadow \
+           -Wredundant-decls -Winline -fmax-errors=1
+#  -Wunreachable-code
+CXXFLAGS += -ffast-math -O3 -march=native -std=c++17 ${WARNINGS}
+#CXXFLAGS += -Ofast -funroll-all-loops -std=c++17 ${WARNINGS}
+#-msse3
+# -ftree-vectorizer-verbose=2  -DNDEBUG
+# -ftree-vectorizer-verbose=5
+# -ftree-vectorize -fdump-tree-vect-blocks=foo.dump  -fdump-tree-pre=stderr
+
+# CFLAGS	= -ffast-math -O3 -DNDEBUG -msse3 -fopenmp -fdump-tree-vect-details
+# CFLAGS	= -ffast-math -O3 -funroll-loops -DNDEBUG -msse3 -fopenmp -ftree-vectorizer-verbose=2
+# #CFLAGS	= -ffast-math -O3 -DNDEBUG -msse3 -fopenmp
+# FFLAGS	= -ffast-math -O3 -DNDEBUG -msse3 -fopenmp
+# LFLAGS  = -ffast-math -O3 -DNDEBUG -msse3 -fopenmp
+LINKFLAGS   += -O3
+
+#architecture
+CPU = -march=znver2
+#CPU = -march=core-avx2
+CXXFLAGS  += ${CPU}
+LINKFLAGS += ${CPU}
+
+# MKL
+#CXXFLAGS  += -I/usr/include/mkl -DUSE_MKL -Wno-redundant-decls
+CXXFLAGS  += -I${MKL_INCLUDE} -DUSE_MKL -Wno-redundant-decls
+#LINKFLAGS += -lmkl_intel_lp64 -lmkl_tbb_thread  -ltbb -lmkl_core
+#LINKFLAGS += -lmkl_intel_lp64 -lmkl_gnu_thread -lmkl_core -L/usr/lib/x86_64-linux-gnu/mkl 
+LINKFLAGS += -L${MKL_LIB} -lmkl_intel_lp64 -lmkl_gnu_thread -lmkl_core
+#LINKFLAGS += -lmkl_intel_lp64 -lmkl_sequential -lmkl_core
+
+# workaround for  MKL slow down on AMD hardware
+# https://danieldk.eu/Posts/2020-08-31-MKL-Zen.html
+default: run
+# Builds the preload library from fakeintel.c and prints how to activate it.
+# Every recipe line has to start with a TAB; the hint was indented with blanks before
+# and therefore was not part of the recipe.
+libfakeintel.so: 
+	gcc -shared -fPIC -o libfakeintel.so fakeintel.c
+	@echo "call:  export LD_PRELOAD=./libfakeintel.so "
+
+# different libraries in Ubuntu or Manjaro
+#ifndef UBUNTU
+#UBUNTU=1
+#endif
+
+## BLAS, LAPACK
+#ifeq ($(UBUNTU),1)
+#LINKFLAGS += -llapack -lblas
+## -lopenblas
+#else
+## on  archlinux
+#LINKFLAGS += -llapack -lopenblas -lcblas
+#endif
+
+# interprocedural optimization
+CXXFLAGS  += -flto
+LINKFLAGS += -flto
+
+# for debugging purpose (save code)
+# -fsanitize=leak         # only one out the three can be used
+# -fsanitize=address
+# -fsanitize=thread
+SANITARY =  -fsanitize=address  -fsanitize=undefined -fsanitize=null -fsanitize=return \
+ -fsanitize=bounds -fsanitize=alignment -fsanitize=float-divide-by-zero -fsanitize=float-cast-overflow \
+ -fsanitize=bool -fsanitize=enum -fsanitize=vptr
+#CXXFLAGS  += ${SANITARY}
+#LINKFLAGS += ${SANITARY}
+
+# OpenMP
+CXXFLAGS += -fopenmp
+LINKFLAGS += -fopenmp
+
+default: ${PROGRAM}
+
+${PROGRAM}:	${OBJECTS}
+	$(LINKER)  $^  ${LINKFLAGS} -o $@
+
+clean:
+	@rm -f ${PROGRAM} ${OBJECTS}
+
+clean_all:: clean
+	-@rm -f *_ *~ *.bak *.log *.out *.tar *.orig *.optrpt
+	-@rm -rf html
+
+run: clean ${PROGRAM} libfakeintel.so
+#run: ${PROGRAM}
+#	time  ./${PROGRAM} ${PARAMS}
+	./${PROGRAM} ${PARAMS}
+
+# tar the current directory
+MY_DIR = `basename ${PWD}`
+tar: clean_all
+	@echo "Tar the directory: " ${MY_DIR}
+	@cd .. ;\
+	tar cf ${MY_DIR}.tar ${MY_DIR} *default.mk ;\
+	cd ${MY_DIR}
+# 	tar cf `basename ${PWD}`.tar *
+#find . -size +10M > large_files
+#--exclude-from ${MY_DIR}/large_files
+
+zip: clean
+	@echo "Zip the directory: " ${MY_DIR}
+	@cd .. ;\
+	zip -r ${MY_DIR}.zip ${MY_DIR} *default.mk ;\
+	cd ${MY_DIR}
+
+doc:
+	doxygen Doxyfile
+
+#########################################################################
+.SUFFIXES: .f90
+
+.cpp.o:
+	$(CXX) -c $(CXXFLAGS) -o $@ $<
+#	$(CXX) -c $(CXXFLAGS) -o $@ $<  2>&1 | tee -a $<.log 
+#	$(CXX) -c $(CXXFLAGS) -o $@ $<  2>&1 | tee -a $(<:.cpp=.log)
+
+.c.o:
+	$(CC) -c $(CFLAGS) -o $@ $<
+
+.f.o:
+	$(F77) -c $(FFLAGS) -o $@ $<
+
+.f90.o:
+	$(F77) -c $(FFLAGS) -o $@ $<
+
+##################################################################################################
+#    some tools
+# Cache behaviour (CXXFLAGS += -g  tracks down to source lines; no -pg in linkflags)
+cache: ${PROGRAM}
+	valgrind --tool=callgrind --simulate-cache=yes ./$^  ${PARAMS}
+#	kcachegrind callgrind.out.<pid> &
+	kcachegrind `ls -1tr  callgrind.out.* |tail -1`
+
+# Check for wrong memory accesses, memory leaks, ...
+# use smaller data sets
+# no "-pg"  in compile/link options
+mem: ${PROGRAM}
+	valgrind -v --leak-check=yes --tool=memcheck --undef-value-errors=yes --track-origins=yes --log-file=$^.addr.out --show-reachable=yes ./$^  ${PARAMS}
+# Graphical interface
+# valkyrie
+
+#  Simple run time profiling of your code
+#  CXXFLAGS += -g -pg
+#  LINKFLAGS += -pg
+prof: ${PROGRAM}
+	perf record ./$^  ${PARAMS}
+	perf report
+#	gprof -b ./$^ > gp.out
+#	kprof -f gp.out -p gprof &
+
+#  perf in Ubuntu 20.04:   https://www.howtoforge.com/how-to-install-perf-performance-analysis-tool-on-ubuntu-20-04/
+#  * install 
+#  * sudo vi /etc/sysctl.conf
+#                add   kernel.perf_event_paranoid = 0
+
+#Trace your heap:
+#> heaptrack ./main.GCC_
+#> heaptrack_gui heaptrack.main.GCC_.<pid>.gz
+heap: ${PROGRAM}
+	heaptrack ./$^  ${PARAMS} 11
+	heaptrack_gui  `ls -1tr  heaptrack.$^.* |tail -1` &
+
+codecheck: $(SOURCES)
+	cppcheck --enable=all --inconclusive --std=c++17 --suppress=missingIncludeSystem $^
+
+
+########################################################################
+#  get the detailed  status of all optimization flags
+info:
+	echo "detailed  status of all optimization flags"
+	$(CXX) --version
+	$(CXX) -Q $(CXXFLAGS) --help=optimizers
+	lscpu
+	inxi -C
+	lstopo
+
+# Excellent hardware info
+#	hardinfo
+# Life monitoring of CPU frequency etc.
+#	sudo i7z
+
+# Memory  consumption
+#	vmstat -at -SM 3
+#	xfce4-taskmanager
+
+
+# https://www.tecmint.com/check-linux-cpu-information/
+#https://www.tecmint.com/monitor-cpu-and-gpu-temperature-in-ubuntu/
+
+# Debugging:
+# https://wiki.archlinux.org/index.php/Debugging
--- a/ex5/demo_skalar/GCC_SINGLE_default.mk
+++ b/ex5/demo_skalar/GCC_SINGLE_default.mk
@ -0,0 +1,111 @@
+# Basic definitions for using the GNU compiler suite sequentially
+# requires setting of COMPILER=GCC_SINGLE_
+
+CC	= gcc
+CXX     = g++
+F77	= gfortran
+LINKER  = ${CXX}
+
+# on mephisto:
+#CXXFLAGS  += -I/share/apps/atlas/include
+#LINKFLAGS += -L/share/apps/atlas/lib -L/usr/lib64/atlas
+#LINKFLAGS   += -latlas -lcblas
+#LINKFLAGS   += -lblas
+# The <cblas.h> header has to be wrapped in extern "C" so that g++ finds everything.
+
+
+WARNINGS = -Wall -pedantic -Wextra -Weffc++ -Woverloaded-virtual -Wfloat-equal -Wshadow \
+           -Wredundant-decls -Winline -fmax-errors=1
+#  -Wunreachable-code
+#CXXFLAGS += -std=c++17 -ffast-math -O3 -march=native -DNDEBUG ${WARNINGS}
+CXXFLAGS += -std=c++17 -ffast-math -O3 -march=native ${WARNINGS}
+
+# info on vectorization
+#VECTORIZE = -ftree-vectorize -fdump-tree-vect-blocks=foo.dump
+#-fdump-tree-pre=stderr
+VECTORIZE = -ftree-vectorize -fopt-info -ftree-vectorizer-verbose=5
+#CXXFLAGS += ${VECTORIZE}
+# -funroll-all-loops   -msse3
+#GCC  -march=knl -march=broadwell -march=haswell
+
+# interprocedural optimization
+#CXXFLAGS += -flto
+LINKFLAGS += -flto
+
+# for debugging purpose (save code)
+# -fsanitize=leak         # only one out of the three can be used
+# -fsanitize=address
+# -fsanitize=thread
+SANITARY =  -fsanitize=address  -fsanitize=undefined -fsanitize=null -fsanitize=return \
+ -fsanitize=bounds -fsanitize=alignment -fsanitize=float-divide-by-zero -fsanitize=float-cast-overflow \
+ -fsanitize=bool -fsanitize=enum -fsanitize=vptr
+#CXXFLAGS  += ${SANITARY}
+#LINKFLAGS +=${SANITARY}
+
+# OpenMP  but no OpenMP in Single mode
+#CXXFLAGS += -fopenmp
+LINKFLAGS += -fopenmp
+
+default: ${PROGRAM}
+
+${PROGRAM}:	${OBJECTS}
+	$(LINKER)  $^  ${LINKFLAGS} -o $@
+
+clean:
+	@rm -f ${PROGRAM} ${OBJECTS}
+
+clean_all:: clean
+	@rm -f *_ *~ *.bak *.log *.out *.tar *.orig *.optrpt
+	@rm -rf html
+
+run: clean ${PROGRAM}
+#	time  ./${PROGRAM}
+	./${PROGRAM}
+
+# tar the current directory
+MY_DIR = `basename ${PWD}`
+tar:
+	@echo "Tar the directory: " ${MY_DIR}
+	@cd .. ;\
+	tar cf ${MY_DIR}.tar ${MY_DIR} *default.mk ;\
+	cd ${MY_DIR}
+# 	tar cf `basename ${PWD}`.tar *
+
+doc:
+	doxygen Doxyfile
+
+#########################################################################
+
+.cpp.o:
+	$(CXX) -c $(CXXFLAGS) -o $@ $<
+
+.c.o:
+	$(CC) -c $(CFLAGS) -o $@ $<
+
+.f.o:
+	$(F77) -c $(FFLAGS) -o $@ $<
+
+##################################################################################################
+#    some tools
+# Cache behaviour (CXXFLAGS += -g  tracks down to source lines; no -pg in linkflags)
+cache: ${PROGRAM}
+	valgrind --tool=callgrind --simulate-cache=yes ./$^
+#	kcachegrind callgrind.out.<pid> &
+	kcachegrind `ls -1tr  callgrind.out.* |tail -1`
+
+# Check for wrong memory accesses, memory leaks, ...
+# use smaller data sets
+mem: ${PROGRAM}
+	valgrind -v --leak-check=yes --tool=memcheck --undef-value-errors=yes --track-origins=yes --log-file=$^.addr.out --show-reachable=yes ./$^
+
+
+thread:${PROGRAM}
+	valgrind -v --tool=helgrind --log-file=$^.thread.out ./$^
+
+#  Simple run time profiling of your code
+#  CXXFLAGS += -g -pg
+#  LINKFLAGS += -pg
+prof: ${PROGRAM}
+	./$^
+	gprof -b ./$^ > gp.out
+#	kprof -f gp.out -p gprof &
--- a/ex5/demo_skalar/GCC_default.mk
+++ b/ex5/demo_skalar/GCC_default.mk
@ -0,0 +1,182 @@
+# Basic definitions for using the GNU compiler suite sequentially
+# requires setting of COMPILER=GCC_
+
+CC	= gcc
+CXX     = g++
+F77	= gfortran
+LINKER  = ${CXX}
+
+WARNINGS = -Wall -pedantic -Wextra -Weffc++ -Woverloaded-virtual -Wfloat-equal -Wshadow \
+           -Wredundant-decls -Winline -fmax-errors=1
+#  -Wunreachable-code
+CXXFLAGS += -ffast-math -O3 -march=native -std=c++20 ${WARNINGS}
+#CXXFLAGS += -Ofast -funroll-all-loops -std=c++17 ${WARNINGS}
+#-msse3
+# -ftree-vectorizer-verbose=2  -DNDEBUG
+# -ftree-vectorizer-verbose=5
+# -ftree-vectorize -fdump-tree-vect-blocks=foo.dump  -fdump-tree-pre=stderr
+
+# CFLAGS	= -ffast-math -O3 -DNDEBUG -msse3 -fopenmp -fdump-tree-vect-details
+# CFLAGS	= -ffast-math -O3 -funroll-loops -DNDEBUG -msse3 -fopenmp -ftree-vectorizer-verbose=2
+# #CFLAGS	= -ffast-math -O3 -DNDEBUG -msse3 -fopenmp
+# FFLAGS	= -ffast-math -O3 -DNDEBUG -msse3 -fopenmp
+# LFLAGS  = -ffast-math -O3 -DNDEBUG -msse3 -fopenmp
+LINKFLAGS   += -O3
+
+#architecture
+#CPU = -march=znver2
+CXXFLAGS  += ${CPU}
+LINKFLAGS += ${CPU}
+
+# different libraries in Ubuntu or Manjaro
+ifndef UBUNTU
+UBUNTU=1
+endif
+
+# BLAS, LAPACK
+ifeq ($(UBUNTU),1)
+LINKFLAGS += -llapack -lblas
+# -lopenblas
+else
+# on  archlinux
+LINKFLAGS += -llapack -lopenblas -lcblas
+endif
+
+# interprocedural optimization
+CXXFLAGS  += -flto
+LINKFLAGS += -flto
+
+# for debugging purpose (save code)
+# -fsanitize=leak         # only one out the three can be used
+# -fsanitize=address
+# -fsanitize=thread
+SANITARY =  -fsanitize=address  -fsanitize=undefined -fsanitize=null -fsanitize=return \
+ -fsanitize=bounds -fsanitize=alignment -fsanitize=float-divide-by-zero -fsanitize=float-cast-overflow \
+ -fsanitize=bool -fsanitize=enum -fsanitize=vptr
+#CXXFLAGS  += ${SANITARY}
+#LINKFLAGS += ${SANITARY}
+
+# OpenMP
+CXXFLAGS += -fopenmp
+LINKFLAGS += -fopenmp
+
+default: ${PROGRAM}
+
+${PROGRAM}:	${OBJECTS}
+	$(LINKER)  $^  ${LINKFLAGS} -o $@
+
+clean:
+	@rm -f ${PROGRAM} ${OBJECTS}
+
+clean_all:: clean
+	-@rm -f *_ *~ *.bak *.log *.out *.tar *.orig *.optrpt
+	-@rm -rf html
+
+run: clean ${PROGRAM}
+#run: ${PROGRAM}
+#	time  ./${PROGRAM} ${PARAMS}
+	./${PROGRAM} ${PARAMS}
+
+# tar the current directory
+MY_DIR = `basename ${PWD}`
+tar: clean_all
+	@echo "Tar the directory: " ${MY_DIR}
+	@cd .. ;\
+	tar cf ${MY_DIR}.tar ${MY_DIR} *default.mk ;\
+	cd ${MY_DIR}
+# 	tar cf `basename ${PWD}`.tar *
+#find . -size +10M > large_files
+#--exclude-from ${MY_DIR}/large_files
+
+zip: clean
+	@echo "Zip the directory: " ${MY_DIR}
+	@cd .. ;\
+	zip -r ${MY_DIR}.zip ${MY_DIR} *default.mk ;\
+	cd ${MY_DIR}
+
+doc:
+	doxygen Doxyfile
+
+#########################################################################
+.SUFFIXES: .f90
+
+.cpp.o:
+	$(CXX) -c $(CXXFLAGS) -o $@ $<
+#	$(CXX) -c $(CXXFLAGS) -o $@ $<  2>&1 | tee -a $<.log 
+#	$(CXX) -c $(CXXFLAGS) -o $@ $<  2>&1 | tee -a $(<:.cpp=.log)
+
+.c.o:
+	$(CC) -c $(CFLAGS) -o $@ $<
+
+.f.o:
+	$(F77) -c $(FFLAGS) -o $@ $<
+
+.f90.o:
+	$(F77) -c $(FFLAGS) -o $@ $<
+
+##################################################################################################
+#    some tools
+# Cache behaviour (CXXFLAGS += -g  tracks down to source lines; no -pg in linkflags)
+cache: ${PROGRAM}
+	valgrind --tool=callgrind --simulate-cache=yes ./$^  ${PARAMS}
+#	kcachegrind callgrind.out.<pid> &
+	kcachegrind `ls -1tr  callgrind.out.* |tail -1`
+
+# Check for wrong memory accesses, memory leaks, ...
+# use smaller data sets
+# no "-pg"  in compile/link options
+mem: ${PROGRAM}
+	valgrind -v --leak-check=yes --tool=memcheck --undef-value-errors=yes --track-origins=yes --log-file=$^.addr.out --show-reachable=yes ./$^  ${PARAMS}
+# Graphical interface
+# valkyrie
+
+#  Simple run time profiling of your code
+#  CXXFLAGS += -g -pg
+#  LINKFLAGS += -pg
+prof: ${PROGRAM}
+	perf record ./$^  ${PARAMS}
+	perf report
+#	gprof -b ./$^ > gp.out
+#	kprof -f gp.out -p gprof &
+
+#  perf in Ubuntu 20.04:   https://www.howtoforge.com/how-to-install-perf-performance-analysis-tool-on-ubuntu-20-04/
+#  * install 
+#  * sudo vi /etc/sysctl.conf
+#                add   kernel.perf_event_paranoid = 0
+
+#Trace your heap:
+#> heaptrack ./main.GCC_
+#> heaptrack_gui heaptrack.main.GCC_.<pid>.gz
+heap: ${PROGRAM}
+	heaptrack ./$^  ${PARAMS} 11
+	heaptrack_gui  `ls -1tr  heaptrack.$^.* |tail -1` &
+
+# Static code analysis of all sources with cppcheck.
+# The include path is only passed if CUDA_INC is set: with an empty CUDA_INC a bare '-I'
+# has no path argument and breaks the cppcheck command line.
+codecheck: $(SOURCES)
+	cppcheck --enable=all --inconclusive --std=c++17 $(addprefix -I,${CUDA_INC}) --suppress=missingIncludeSystem $^
+
+
+########################################################################
+#  get the detailed  status of all optimization flags
+info:
+	echo "detailed  status of all optimization flags"
+	$(CXX) --version
+	$(CXX) -Q $(CXXFLAGS) --help=optimizers
+	lscpu
+	inxi -C
+	lstopo
+
+# Excellent hardware info
+#	hardinfo
+# Life monitoring of CPU frequency etc.
+#	sudo i7z
+
+# Memory  consumption
+#	vmstat -at -SM 3
+#	xfce4-taskmanager
+
+
+# https://www.tecmint.com/check-linux-cpu-information/
+#https://www.tecmint.com/monitor-cpu-and-gpu-temperature-in-ubuntu/
+
+# Debugging:
+# https://wiki.archlinux.org/index.php/Debugging
--- a/ex5/demo_skalar/ICC_default.mk
+++ b/ex5/demo_skalar/ICC_default.mk
@ -0,0 +1,151 @@
+# Basic definitions for using the INTEL compiler suite sequentially
+# requires setting of COMPILER=ICC_
+
+# special on my sony [GH]
+#BINDIR = /opt/save.intel/bin/
+# very special on my sony [GH]
+# FIND_LIBS = -L /opt/save.intel/composer_xe_2013.1.117/mkl/lib/intel64/libmkl_intel_lp64.so
+
+#export KMP_AFFINITY=verbose,compact
+
+CC	= ${BINDIR}icc
+CXX     = ${BINDIR}icpc
+F77	= ${BINDIR}ifort
+LINKER  = ${CXX}
+
+WARNINGS = -pedantic -Wall -Weffc++ -Woverloaded-virtual -Wfloat-equal -Wshadow -wd2015,2012
+          #-Winline -Wunreachable-code  -Wredundant-decls
+CXXFLAGS +=  -std=c++17 -O3  -fma -DNDEBUG ${WARNINGS} -mkl
+#CXXFLAGS +=  -std=c++17 -O3 -march=core-avx2  -fma -ftz -fomit-frame-pointer -DNDEBUG ${WARNINGS} -mkl
+# -fast       # fast includes also -ipo !
+CXXFLAGS +=  -fargument-noalias -fargument-noalias-global -ansi-alias
+CXXFLAGS +=  -align -qopt-dynamic-align
+#CXXFLAGS +=  -xCore-AVX2
+#CXXFLAGS +=  -tp=zen
+# -qopt-subscript-in-range
+# -vec-threshold0
+# -xCORE-AVX2
+# -axcode COMMON-AVX512 -axcode MIC-AVX512 -axcode CORE-AVX512 -axcode CORE-AVX2
+# -ipo
+
+# Reports: https://software.intel.com/en-us/articles/getting-the-most-out-of-your-intel-compiler-with-the-new-optimization-reports
+#CXXFLAGS +=  -qopt-report=5 -qopt-report-phase=vec,par
+
+#CXXFLAGS +=  -qopt-report=5 -qopt-report-phase=cg
+# Redirect report from *.optrpt to stderr
+#    -qopt-report-file=stderr
+# Guided parallelization
+#    -guide -parallel
+#    -guide-opts=string  -guide-par[=n]  -guide-vec[=n]
+#    -auto-p32 -simd
+
+# interprocedural optimization
+#CXXFLAGS += -ipo
+#LINKFLAGS += -ipo
+
+# annotated Assembler file
+ANNOTED = -fsource-asm -S 
+
+# OpenMP
+CXXFLAGS += -qopenmp
+# -qopt-report-phase=openmp
+# -diag-enable=sc-full  -diag-file=filename -diag-file-append[=filename]
+LINKFLAGS += -qopenmp
+
+# use MKL by INTEL
+# LINKFLAGS += -L${BINDIR}../composer_xe_2013.1.117/mkl/lib/intel64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lpthread
+LINKFLAGS += -O2 -mkl
+# -ipo
+
+
+
+default:	${PROGRAM}
+
+${PROGRAM}:	${OBJECTS}
+	$(LINKER)  $^  ${LINKFLAGS} -o $@
+
+clean:
+	rm -f ${PROGRAM} ${OBJECTS} *.optrpt
+
+clean_all:: clean
+	@rm -f *_ *~ *.bak *.log *.out *.tar
+
+run: clean ${PROGRAM}
+	./${PROGRAM}
+
+# tar the current directory
+MY_DIR = `basename ${PWD}`
+tar: clean_all
+	@echo "Tar the directory: " ${MY_DIR}
+	@cd .. ;\
+	tar cf ${MY_DIR}.tar ${MY_DIR} *default.mk ;\
+	cd ${MY_DIR}
+# 	tar cf `basename ${PWD}`.tar *
+
+doc:
+	doxygen Doxyfile
+
+#########################################################################
+
+.cpp.o:
+	$(CXX) -c $(CXXFLAGS) -o $@ $<
+
+.c.o:
+	$(CC) -c $(CFLAGS) -o $@ $<
+
+.f.o:
+	$(F77) -c $(FFLAGS) -o $@ $<
+
+##################################################################################################
+# #    some tools
+# # Cache behaviour (CXXFLAGS += -g  tracks down to source lines)
+# cache: ${PROGRAM}
+# 	valgrind --tool=callgrind --simulate-cache=yes ./$^
+# #	kcachegrind callgrind.out.<pid> &
+#
+# # Check for wrong memory accesses, memory leaks, ...
+# # use smaller data sets
+# mem: ${PROGRAM}
+# 	valgrind -v --leak-check=yes --tool=memcheck --undef-value-errors=yes --track-origins=yes --log-file=$^.addr.out --show-reachable=yes ./$^
+#
+# #  Simple run time profiling of your code
+# #  CXXFLAGS += -g -pg
+# #  LINKFLAGS += -pg
+# prof: ${PROGRAM}
+# 	./$^
+# 	gprof -b ./$^ > gp.out
+# #	kprof -f gp.out -p gprof &
+#
+
+
+mem: inspector
+prof: amplifier
+cache: amplifier
+
+gap_par_report:
+	${CXX}  -c -guide -parallel $(SOURCES) 2> gap.txt
+
+# GUI for performance report
+amplifier: ${PROGRAM}
+	echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope
+	echo 0 | sudo tee /proc/sys/kernel/perf_event_paranoid
+	amplxe-gui &
+
+# GUI for Memory and Thread analyzer (race condition)
+inspector: ${PROGRAM}
+	echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope
+	#${BINDIR}../inspector_xe_2013/bin64/inspxe-gui &
+	inspxe-gui &
+
+# Starts the Intel Advisor GUI with ADVIXE_EXPERIMENTAL=roofline set in its environment.
+advisor:
+	echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope
+#	https://software.intel.com/en-us/articles/intel-advisor-2017-update-1-what-s-new
+#	make runs every recipe line in its own shell: a separate 'export' line would not
+#	reach the GUI, so the variable is set on the command line that starts it.
+	ADVIXE_EXPERIMENTAL=roofline advixe-gui &
+
+icc-info:
+	icpc -# main.cpp
+
+
+
+
--- a/ex5/demo_skalar/ONEAPI_default.mk
+++ b/ex5/demo_skalar/ONEAPI_default.mk
@ -0,0 +1,181 @@
+# Basic definitions for using the INTEL compiler suite sequentially
+# requires setting of COMPILER=ONEAPI_
+
+#         https://software.intel.com/content/www/us/en/develop/tools/oneapi/components/onemkl/link-line-advisor.html
+# requires
+# source /opt/intel/oneapi/setvars.sh
+# on  AMD:    export MKL_DEBUG_CPU_TYPE=5
+
+#BINDIR = /opt/intel/oneapi/compiler/latest/linux/bin/
+#MKL_ROOT = /opt/intel/oneapi/mkl/latest/
+#export KMP_AFFINITY=verbose,compact
+
+CC	= ${BINDIR}icc
+CXX     = ${BINDIR}dpcpp
+F77	= ${BINDIR}ifort
+LINKER  = ${CXX}
+
+## Compiler flags
+WARNINGS = -Wall -Weffc++ -Woverloaded-virtual -Wfloat-equal -Wshadow -pedantic
+WARNINGS += -Wpessimizing-move -Wredundant-move
+#-wd2015,2012,2014 -wn3
+#    -Winline -Wredundant-decls -Wunreachable-code
+# -qopt-subscript-in-range
+# -vec-threshold0
+
+CXXFLAGS += -O3 -std=c++17 -tbb ${WARNINGS}
+#            https://stackoverflow.com/questions/67923287/how-to-resolve-no-member-named-task-in-namespace-tbb-error-when-using-oned
+#            needed on Desktop-PC Haase (not needed on mephisto)
+CXXFLAGS += -D_GLIBCXX_USE_TBB_PAR_BACKEND=0
+#CXXFLAGS += -DMKL_ILP64  -I"${MKLROOT}/include"
+#CXXFLAGS += -DMKL_ILP32  -I"${MKLROOT}/include"
+LINKFLAGS += -O3 -tbb
+#LINKFLAGS += -no-prec-div
+
+# interprocedural optimization
+CXXFLAGS  += -ipo
+LINKFLAGS += -ipo
+LINKFLAGS += -flto
+
+# annotated Assembler file
+ANNOTED = -fsource-asm -S 
+
+#architecture
+CPU  = -march=core-avx2
+#CPU += -mtp=zen
+# -xCORE-AVX2
+# -axcode COMMON-AVX512 -axcode MIC-AVX512 -axcode CORE-AVX512 -axcode CORE-AVX2
+CXXFLAGS  += ${CPU}
+LINKFLAGS += ${CPU}
+
+# use MKL by INTEL
+# https://software.intel.com/content/www/us/en/develop/tools/oneapi/components/onemkl/link-line-advisor.html
+# sequential MKL
+#                              use the 32 bit interface (LP64) instead of 64 bit interface (ILP64)
+CXXFLAGS +=  -qmkl=sequential  -UMKL_ILP64
+LINKFLAGS += -O3 -qmkl=sequential -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lpthread
+#LINKFLAGS += -O3 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lpthread
+
+# shared libs:  https://aur.archlinux.org/packages/intel-oneapi-compiler-static
+#     install intel-oneapi-compiler-static   
+# or 
+LINKFLAGS += -shared-intel
+
+
+OPENMP = -qopenmp
+CXXFLAGS += ${OPENMP}
+LINKFLAGS += ${OPENMP}
+
+
+# profiling tools
+#CXXFLAGS  += -pg
+#LINKFLAGS += -pg
+# -vec-report=3
+# -qopt-report=5 -qopt-report-phase=vec -qopt-report-phase=openmp
+# -guide -parallel
+# -guide-opts=string  -guide-par[=n]  -guide-vec[=n]
+# -auto-p32 -simd
+
+# Reports: https://software.intel.com/en-us/articles/getting-the-most-out-of-your-intel-compiler-with-the-new-optimization-reports
+#CXXFLAGS +=  -qopt-report=5 -qopt-report-phase=vec,par
+#CXXFLAGS +=  -qopt-report=5 -qopt-report-phase=cg
+# Redirect report from *.optrpt to stderr
+#    -qopt-report-file=stderr
+# Guided parallelization
+#    -guide -parallel
+#    -guide-opts=string  -guide-par[=n]  -guide-vec[=n]
+#    -auto-p32 -simd
+
+## run time checks
+# https://www.intel.com/content/www/us/en/develop/documentation/fortran-compiler-oneapi-dev-guide-and-reference/top/compiler-reference/compiler-options/offload-openmp-and-parallel-processing-options/par-runtime-control-qpar-runtime-control.html
+
+
+default:	${PROGRAM}
+
+${PROGRAM}:	${OBJECTS}
+	$(LINKER)  $^  ${LINKFLAGS} -o $@
+
+clean:
+	rm -f ${PROGRAM} ${OBJECTS} *.optrpt
+
+clean_all:: clean
+	@rm -f *_ *~ *.bak *.log *.out *.tar
+
+run: clean ${PROGRAM}
+	./${PROGRAM} ${PARAMS}
+
+# tar the current directory
+MY_DIR = `basename ${PWD}`
+tar: clean_all
+	@echo "Tar the directory: " ${MY_DIR}
+	@cd .. ;\
+	tar cf ${MY_DIR}.tar ${MY_DIR} *default.mk ;\
+	cd ${MY_DIR}
+# 	tar cf `basename ${PWD}`.tar *
+
+doc:
+	doxygen Doxyfile
+
+#########################################################################
+
+.cpp.o:
+	$(CXX) -c $(CXXFLAGS) -o $@ $<
+
+.c.o:
+	$(CC) -c $(CFLAGS) -o $@ $<
+
+.f.o:
+	$(F77) -c $(FFLAGS) -o $@ $<
+
+##################################################################################################
+#    some tools
+# Cache behaviour (CXXFLAGS += -g  tracks down to source lines)
+# https://software.intel.com/content/www/us/en/develop/documentation/vtune-help/top/analyze-performance/microarchitecture-analysis-group/memory-access-analysis.html
+
+mem: inspector
+prof: vtune
+cache: inspector
+
+gap_par_report:
+	${CXX}  -c -guide -parallel $(SOURCES) 2> gap.txt
+
+# GUI for performance report
+amplifier: ${PROGRAM}
+	echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope
+	echo 0 | sudo tee /proc/sys/kernel/perf_event_paranoid
+	amplxe-gui &
+
+# GUI for Memory and Thread analyzer (race condition)
+inspector: ${PROGRAM}
+	echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope
+#	inspxe-gui &
+	vtune-gui ./${PROGRAM} &
+
+advisor:
+#	source /opt/intel/oneapi/advisor/2021.2.0/advixe-vars.sh
+#	/opt/intel/oneapi/advisor/latest/bin64/advixe-gui &
+	advisor --collect=survey ./${PROGRAM} ${PARAMS}
+#	advisor --collect=roofline ./${PROGRAM} ${PARAMS}
+	advisor --report=survey --project-dir=./ src:r=./ --format=csv --report-output=./out/survey.csv
+#	advisor-gui
+
+# Hotspot profiling with vtune; the text report is written to vtune.out.
+vtune:
+	echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope
+#	https://software.intel.com/en-us/articles/intel-advisor-2017-update-1-what-s-new
+#	NOTE(review): make runs each recipe line in its own shell, so this export
+#	presumably does not reach the vtune calls below - confirm whether it is needed.
+	export ADVIXE_EXPERIMENTAL=roofline
+	vtune -collect hotspots ./${PROGRAM} ${PARAMS}
+	vtune -report hotspots -r r000hs > vtune.out
+#	vtune-gui ./${PROGRAM} &	
+
+icc-info:
+	icpc -# main.cpp
+
+# MKL on AMD
+# https://www.computerbase.de/2019-11/mkl-workaround-erhoeht-leistung-auf-amd-ryzen/
+#
+# https://sites.google.com/a/uci.edu/mingru-yang/programming/mkl-has-bad-performance-on-an-amd-cpu
+# export MKL_DEBUG_CPU_TYPE=5
+# export MKL_NUM_THREADS=1
+# export MKL_DYNAMIC=false
+#  on Intel compiler
+# http://publicclu2.blogspot.com/2013/05/intel-complier-suite-reference-card.html
--- a/ex5/demo_skalar/PGI_default.mk
+++ b/ex5/demo_skalar/PGI_default.mk
@ -0,0 +1,96 @@
+# Basic definitions for using the PGI compiler suite sequentially
+# requires setting of COMPILER=PGI_
+# OPTIRUN = optirun
+# on mephisto:
+#CXXFLAGS  += -I/share/apps/atlas/include
+#LINKFLAGS += -L/share/apps/atlas/lib
+#LINKFLAGS   += -lcblas -latlas
+
+LINKFLAGS   += -lblas
+
+CC	= pgcc
+CXX     = pgc++
+F77	= pgfortran
+LINKER  = ${CXX}
+
+
+WARNINGS = -Minform=warn
+
+#PGI_PROFILING = -Minfo=loop,vect,opt,intensity,mp,accel
+PGI_PROFILING = -Minfo=ccff,accel,ipa,loop,lre,mp,opt,par,unified,vect,intensity
+
+# -Minfo
+# -Mprof=lines
+
+CXXFLAGS += -std=c++14 -O3 -fast  -DNDEBUG ${PGI_PROFILING} ${WARNINGS}
+CXXFLAGS += -Mvect -Mcache_align -Msafeptr -Mprefetch -Mlre -Mdepchk
+#-Msmart  
+
+LINKFLAGS   += ${PGI_PROFILING}
+#-lcblas
+# OpenMP
+CXXFLAGS += -mp=align,bind,numa -Mneginfo=mp
+LINKFLAGS += -mp=allcores,bind,numa
+
+default:	${PROGRAM}
+
+${PROGRAM}:	${OBJECTS}
+	$(LINKER)  $^  ${LINKFLAGS} -o $@
+
+clean:
+	@rm -f ${PROGRAM} ${OBJECTS}
+
+clean_all:: clean
+	@rm -f *_ *~ *.bak *.log *.out *.tar
+
+run: clean ${PROGRAM}
+	./${PROGRAM}
+
+# tar the current directory
+MY_DIR = `basename ${PWD}`
+tar: clean_all
+	@echo "Tar the directory: " ${MY_DIR}
+	@cd .. ;\
+	tar cf ${MY_DIR}.tar ${MY_DIR} *default.mk ;\
+	cd ${MY_DIR}
+# 	tar cf `basename ${PWD}`.tar *
+
+doc:
+	doxygen Doxyfile
+
+#########################################################################
+
+.cpp.o:
+	$(CXX) -c $(CXXFLAGS) -o $@ $<
+
+.c.o:
+	$(CC) -c $(CFLAGS) -o $@ $<
+
+.f.o:
+	$(F77) -c $(FFLAGS) -o $@ $<
+
+##################################################################################################
+# #    some tools
+# #  Simple run time profiling of your code
+# #  CXXFLAGS += -g
+# #  LINKFLAGS +=
+
+
+# Profiling options PGI, see: pgprof -h
+PROF_FILE = jac.pgprof
+# CPU_PROF = -allcache
+CPU_PROF = --cpu-profiling on --analysis-metrics 
+# GPU_PROF = -cuda=gmem,branch,cc13 -cudainit
+#GPU_PROF = -cuda=branch:cc20
+#
+
+cache: prof
+
+prof: ${PROGRAM}
+#	./$^
+#	$(CUDA_HOME)/bin/nvvp &
+#  more  /opt/pgi/linux86-64/16.10/bin/pgcollectrc
+	${OPTIRUN} ${BINDIR}pgprof ${CPU_PROF} -o $(PROF_FILE) ./$^
+	${OPTIRUN} ${BINDIR}pgprof -i  $(PROF_FILE) 2> prof.out
+	
+
--- a/ex5/demo_skalar/demo_skalar/Doxyfile
+++ b/ex5/demo_skalar/demo_skalar/Doxyfile
--- a/ex5/demo_skalar/demo_skalar/Makefile
+++ b/ex5/demo_skalar/demo_skalar/Makefile
@ -0,0 +1,30 @@
+#
+# use GNU-Compiler tools
+COMPILER=GCC_
+# alternatively from the shell
+# export COMPILER=GCC_
+# or, alternatively from the shell
+# make COMPILER=GCC_
+
+# use Intel compilers
+#COMPILER=ICC_
+
+# use PGI compilers
+# COMPILER=PGI_
+
+
+SOURCES = main.cpp mylib.cpp
+OBJECTS = $(SOURCES:.cpp=.o)
+
+PROGRAM	= main.${COMPILER}
+
+# keep the next two lines for debugging and detailed performance analysis (comment them out otherwise)
+CXXFLAGS += -g
+LINKFLAGS += -g
+# do not use -pg with PGI compilers
+
+ifndef COMPILER
+  COMPILER=GCC_
+endif
+
+include ../${COMPILER}default.mk
--- a/ex5/demo_skalar/demo_skalar/check_env.h
+++ b/ex5/demo_skalar/demo_skalar/check_env.h
@ -0,0 +1,99 @@
+#pragma once
+
+#include <iostream>
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+#include <string>
+#include <unordered_map>
+
+//#####################################
+// G.Haase
+// See https://sourceforge.net/p/predef/wiki/Compilers/
+//     http://www.cplusplus.com/doc/tutorial/preprocessor/
+//  also:  export OMP_DISPLAY_ENV=VERBOSE
+//#####################################
+/** 	Prints a report on the build and run-time environment to std::cout:
+ *  command line, compiler and its version, C++ standard, the parallel
+ *  environments detected at compile time (MPI, OpenMP, OpenACC)
+ *  and the date/time of compilation.
+ *
+	@tparam    T    integer type of the argument counter
+	@param[in] argc	number of command line arguments
+	@param[in] argv	command line arguments as array of C-strings
+*/
+template <class T>
+void check_env(T argc, char const *argv[])
+{
+    std::cout << "\n#######################################################################\n";
+    std::cout << "Code    :";
+    for (T k = 0; k < argc; ++k) std::cout << "  " << argv[k];
+    std::cout << std::endl;
+
+    // compiler:      https://sourceforge.net/p/predef/wiki/Compilers/
+    // The order matters: Intel, PGI and clang also define __GNUC__, so GNU is tested last.
+    std::cout <<    "Compiler:  ";
+#if defined __INTEL_COMPILER
+#pragma message(" ##########  INTEL  ###############")
+    std::cout << "INTEL " << __INTEL_COMPILER;
+    // Ignore warnings for unrecognized #pragma acc
+#pragma warning disable 161
+    // Ignore warnings for unrecognized #pragma omp
+#pragma warning disable 3180
+
+#elif defined __PGI
+#pragma message(" ##########  PGI    ###############")
+    std::cout << "PGI " << __PGIC__ << "." << __PGIC_MINOR__ << "." << __PGIC_PATCHLEVEL__;
+#elif defined  __clang__
+#pragma message(" ##########  CLANG    ###############")
+    std::cout << "CLANG " << __clang_major__ << "." << __clang_minor__ << "."; // << __clang_patchlevel__;
+#elif defined __GNUC__
+#pragma message(" ##########  Gnu    ###############")
+    std::cout << "Gnu " <<  __GNUC__  << "." << __GNUC_MINOR__ << "." << __GNUC_PATCHLEVEL__;
+#else
+#pragma message(" ##########  unknown Compiler   ###############")
+    std::cout << "unknown";
+#endif
+    std::cout << "  C++ standard: " << __cplusplus << std::endl;
+
+    // Parallel environments
+    std::cout <<    "Parallel:  ";
+#if defined MPI_VERSION
+#pragma message(" ##########  MPI    ###############")
+#ifdef OPEN_MPI
+    std::cout << "OpenMPI ";
+#else
+    std::cout << "MPI ";
+#endif
+    std::cout << MPI_VERSION << "." << MPI_SUBVERSION << "   ";
+#endif
+
+#ifdef _OPENMP
+//https://www.openmp.org/specifications/
+//https://stackoverflow.com/questions/1304363/how-to-check-the-version-of-openmp-on-linux
+    // _OPENMP holds the release date (yyyymm) of the supported specification.
+    std::unordered_map<unsigned, std::string> const map{
+        {200505, "2.5"}, {200805, "3.0"}, {201107, "3.1"}, {201307, "4.0"}, {201511, "4.5"},
+        {201611, "5.0"}, {201811, "5.0"}, {202011, "5.1"}, {202111, "5.2"}};
+#pragma message(" ##########  OPENMP    ###############")
+    std::cout << "OpenMP ";
+    auto const version = map.find(_OPENMP);
+    if (version != map.end()) {
+        std::cout << version->second;
+    }
+    else {
+        std::cout << _OPENMP;          // unknown release: print the raw date code
+    }
+    // The team size is only known inside a parallel region; one thread reports it.
+    #pragma omp parallel
+    {
+        #pragma omp master
+        {
+            const int nn = omp_get_num_threads();          // OpenMP
+            std::cout << " ---> " <<  nn << " Threads   ";
+        }
+        #pragma omp barrier
+    }
+
+#endif
+#ifdef _OPENACC
+#pragma message(" ##########  OPENACC    ###############")
+    std::cout << "OpenACC   ";
+#endif
+    std::cout << std::endl;
+    std::cout << "Date    :  " << __DATE__ << "  " << __TIME__;
+    std::cout << "\n#######################################################################\n";
+}
+// HG
+
--- a/ex5/demo_skalar/demo_skalar/main.cpp
+++ b/ex5/demo_skalar/demo_skalar/main.cpp
@ -0,0 +1,132 @@
+#include "check_env.h"
+#include "mylib.h"
+#include <cstdlib>          // atoi()
+#include <cstring>          // strncmp()
+#include <ctime>
+#include <iostream>
+#include <omp.h>            // OpenMP
+#include <sstream>
+#include <string>
+using namespace std;
+
+/** 	Benchmark driver for the OpenMP inner product.
+ *
+ *  Optionally reads the vector length from the command line (-n <number>),
+ *  reports the environment, measures scalar2() over NLOOPS repetitions,
+ *  checks the result and finally tries the reduction with an STL-vector.
+ *
+	@param[in] argc	number of command line arguments
+	@param[in] argv	command line arguments as array of C-strings
+	@return 	    0
+*/
+int main(int argc, char const *argv[])
+{
+    omp_set_num_threads(4);       // number of threads used by the following parallel regions
+
+// #################################################################
+    unsigned int N = 50000001;
+    int const NLOOPS = 50;        // chose a value such that the benchmark runs at least 10 sec.
+
+    // unsigned int N = 11;
+    // int const NLOOPS = 1;
+    //##########################################################################
+//   Read Parameter from command line  (C++ style)
+    cout << "Checking command line parameters for: -n <number> " << endl;
+    for (int i = 1; i < argc; i++)
+    {
+        cout << " arg[" << i << "] = " << argv[i] << endl;
+        string const ss(argv[i]);
+        if ("-n" == ss && i + 1 < argc) // found "-n" followed by another parameter
+        {
+            int const n_arg = atoi(argv[i + 1]);
+            if (n_arg > 0)
+            {
+                N = static_cast<unsigned int>(n_arg);
+            }
+            else                  // atoi() failed or the length is not positive: keep the default
+            {
+                cout << "Ignoring invalid vector length: " << argv[i + 1] << "\n";
+            }
+            ++i;                  // the value is consumed, do not parse it as an option again
+        }
+        else
+        {
+            cout << "Correct call: " << argv[0] << " -n  <number>\n";
+        }
+    }
+
+    cout << "\nN = " << N << endl;
+
+    check_env(argc, argv);
+//########################################################################
+    int nthreads = 0;                              // OpenMP; set by the master thread below
+    #pragma omp parallel default(none) shared(cout,nthreads)
+    {
+        int const th_id  = omp_get_thread_num();   // OpenMP
+        int const nthrds = omp_get_num_threads();  // OpenMP
+        // compose the message privately, so only the output itself is serialized
+        stringstream ss;
+        ss << "C++: Hello World from thread " << th_id << " / " << nthrds << endl;
+        #pragma omp critical
+        {
+            cout << ss.str();                      // output to a shared resource
+        }
+        #pragma omp master
+        nthreads = nthrds;                         // transfer the team size to the master thread
+    }
+    cout << "   " << nthreads << "   threads have been started." << endl;
+
+//##########################################################################
+//  Memory allocation
+    cout << "Memory allocation\n";
+
+    vector<double> x(N), y(N);
+
+    cout.precision(2);
+    cout << 2.0 * N *sizeof(x[0]) / 1024 / 1024 / 1024 << " GByte Memory allocated\n";
+    cout.precision(6);
+
+//##########################################################################
+//  Data initialization
+//  Special:  x_i = i+1;  y_i = 1/x_i  ==> <x,y> == N
+    for (unsigned int i = 0; i < N; ++i)
+    {
+        x[i] = i + 1.0;
+        y[i] = 1.0 / x[i];
+    }
+
+//##########################################################################
+    cout << "\nStart Benchmarking\n";
+
+// Do calculation
+    double const tstart = omp_get_wtime();            // OpenMP
+
+    double sk(0.0);
+    for (int i = 0; i < NLOOPS; ++i)
+    {
+        sk = scalar2(x, y);
+    }
+
+    double t1 = omp_get_wtime() - tstart;             // OpenMP
+    t1 /= NLOOPS;           // divide by number of function calls
+
+//##########################################################################
+// Check the correct result
+    cout << "\n <x,y> = " << sk << endl;
+    // Round to the nearest integer: the floating point sum may end up slightly below N,
+    // and a plain truncation would then report a wrong result.
+    if (static_cast<unsigned int>(sk + 0.5) != N)
+    {
+        cout << "  !!   W R O N G  result   !!\n";
+    }
+    cout << endl;
+
+//##########################################################################
+// Timings  and Performance
+
+    cout.precision(2);
+    cout << "Timing in sec. : " << t1 << endl;
+    // 2 floating point operations per vector element; Giga = 10^9 for FLOPS
+    cout << "GFLOPS         : " << 2.0 * N / t1 / 1e9 << endl;
+    // 2 doubles are read per vector element; GiByte = 1024^3 Byte
+    cout << "GiByte/s        : " << 2.0 * N / t1 / 1024 / 1024 / 1024 * sizeof(x[0]) << endl;
+
+//#########################################################################
+
+    // ---- Speedup for scalar ----
+    // Threads |  Time  | Speedup
+    //     1   | 0.032s |  1
+    //     2   | 0.022s |  1.45
+    //     4   | 0.021s |  1.52
+    //     8   | 0.022s |  1.45
+    //    16   | 0.022s |  1.45
+
+//#########################################################################
+
+
+    cout << "\n  Try the reduction with an STL-vektor \n";
+
+    auto vr = reduction_vec_append(10);
+    cout << "done\n";
+    cout << vr << endl;
+
+
+    return 0;
+}  // memory for x and y will be deallocated by their destructors
--- a/ex5/demo_skalar/demo_skalar/mylib.cpp
+++ b/ex5/demo_skalar/demo_skalar/mylib.cpp
@ -0,0 +1,84 @@
+#include "mylib.h"
+#include <cassert>       // assert()
+#include <cmath>
+#include <iostream>
+#include <functional>    // multiplies<>{}
+#include <list>
+#include <numeric>       // iota()
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+#include <vector>
+using namespace std;
+
+// Euclidean inner product <x,y>; the loop is distributed statically over the
+// OpenMP threads and the partial sums are combined by a '+' reduction.
+double scalar(vector<double> const &x, vector<double> const &y)
+{
+    assert(x.size() == y.size());         // both vectors need the same length (off with -DNDEBUG)
+    size_t const len = x.size();
+    double acc = 0.0;
+#pragma omp parallel for default(none) shared(x,y,len) reduction(+:acc) schedule(static)
+    for (size_t k = 0; k < len; ++k)
+    {
+        acc += x[k] * y[k];
+    }
+    return acc;
+}
+
+// Euclidean inner product <x,y> with a manually distributed loop:
+// thread th_id sums the products at the indices th_id, th_id+nthrds, th_id+2*nthrds, ...
+// into a private variable, which is finally added atomically to the shared result.
+// Without OpenMP the code degenerates to the sequential loop.
+double scalar2(vector<double> const &x, vector<double> const &y)
+{
+    assert(x.size() == y.size());         // switch off via compile flag: -DNDEBUG
+    size_t const N = x.size();
+    double sum = 0.0;
+
+    #pragma omp parallel default(none) shared(x,y,N,sum)
+    {
+#ifdef _OPENMP
+        size_t const th_id  = static_cast<size_t>(omp_get_thread_num());
+        size_t const nthrds = static_cast<size_t>(omp_get_num_threads());
+#else
+        // <omp.h> is only included for OpenMP builds: act as a team of one thread
+        size_t const th_id  = 0;
+        size_t const nthrds = 1;
+#endif
+        double local_sum = 0.0;           // private partial sum of this thread
+
+        for (size_t i = th_id; i < N; i += nthrds) {
+            local_sum += x[i] * y[i];
+        }
+
+        // one synchronized update per thread instead of one per element
+        #pragma omp atomic
+        sum += local_sum;
+    }
+    return sum;
+}
+
+
+// Test of the user-defined reduction VecAdd (declared in mylib.h) on an STL-vector.
+// vec starts with n zero-initialized elements; per the VecAdd initializer each thread's
+// private copy starts as a copy of it. Every thread prints its id and the size of its
+// copy, overwrites the copy with tID, tID+1, ..., prints it, and at the end of the
+// parallel region the private copies are added componentwise to vec.
+// The barriers/critical sections only keep the output of the threads separated.
+vector<int> reduction_vec(int n)
+{ 
+    vector<int> vec(n);
+#pragma omp parallel default(none) shared(cout) reduction(VecAdd:vec)
+    {
+        #pragma omp barrier
+        #pragma omp critical
+        cout << omp_get_thread_num() << " : " << vec.size() << endl;
+        #pragma omp barrier
+        // fill the thread-private vector: vec[k] = tID + k
+        iota( vec.begin(),vec.end(), omp_get_thread_num() );
+        #pragma omp barrier
+        #pragma omp critical
+        cout << vec << endl;
+    }
+    return vec;
+}
+
+// Test of the user-defined reduction VecAppend (declared in mylib.h) on an STL-vector.
+// Each thread fills a local vector of n elements with tID, tID+1, ... and appends it
+// to its private copy of vec (empty at start, per the VecAppend initializer);
+// the reduction concatenates the private copies into the returned vector.
+// NOTE(review): the order in which the threads' parts are concatenated is presumably
+//               not fixed - verify before relying on it.
+vector<int> reduction_vec_append(int n)
+{ 
+    vector<int> vec;
+#pragma omp parallel default(none) shared(cout,n) reduction(VecAppend:vec)
+    {
+        vector<int> local(n);
+        #pragma omp barrier
+        #pragma omp critical
+        cout << omp_get_thread_num() << " : " << local.size() << endl;
+        #pragma omp barrier
+        // local[k] = tID + k
+        iota( local.begin(),local.end(), omp_get_thread_num() );
+        // NOTE(review): vec names the thread-private reduction copy here, so the
+        //               critical section looks unnecessary - confirm before removing it.
+        #pragma omp critical
+        vec.insert(vec.end(), local.begin(), local.end());
+    }
+    return vec;
+}
--- a/ex5/demo_skalar/demo_skalar/mylib.h
+++ b/ex5/demo_skalar/demo_skalar/mylib.h
@ -0,0 +1,77 @@
+#pragma once
+
+#include <cassert>
+#include <iomanip>             // setw()
+#include <iostream>
+#include <omp.h>
+#include <vector>
+
+/** 	Inner product, parallelized via an OpenMP loop with a sum reduction.
+	@param[in] x	vector
+	@param[in] y	vector of the same length as @p x
+	@return 	    resulting Euclidean inner product <x,y>
+*/
+double scalar(std::vector<double> const &x, std::vector<double> const &y);
+/** 	Inner product with a manually distributed loop: each thread accumulates
+ *  a private partial sum over every nthreads-th element and adds it atomically
+ *  to the result.
+	@param[in] x	vector
+	@param[in] y	vector of the same length as @p x
+	@return 	    resulting Euclidean inner product <x,y>
+*/
+double scalar2(std::vector<double> const &x, std::vector<double> const &y);
+
+/** 	 Componentwise addition of vector @p b to vector @p a .
+	@param[in,out] a	vector that accumulates the sum
+	@param[in]     b	vector to be added, same length as @p a
+	@return 	     reference to @p a after a+=b
+*/
+template<class T>
+std::vector<T> &operator+=(std::vector<T> &a, std::vector<T> const &b)
+{
+    assert(a.size()==b.size());
+    auto src = b.begin();              // walks through b in step with a
+    for (auto &elem : a) {
+        elem += *src;
+        ++src;
+    }
+    return a;
+}
+
+// Declare the reduction operation in OpenMP for an STL-vector
+//   omp_out += omp_in  requires operator+=(vector<int> &, vector<int> const &) from above
+// ------------------------------------------------------------
+// https://scc.ustc.edu.cn/zlsc/tc4600/intel/2016.0.109/compiler_c/common/core/GUID-7312910C-D175-4544-99C5-29C12D980744.htm
+// https://gist.github.com/eruffaldi/7180bdec4c8c9a11f019dd0ba9a2d68c
+// https://stackoverflow.com/questions/29633531/user-defined-reduction-on-vector-of-varying-size
+//  see also p.74ff in  https://www.fz-juelich.de/ias/jsc/EN/AboutUs/Staff/Hagemeier_A/docs-parallel-programming/OpenMP-Slides.pdf
+#pragma omp declare reduction(VecAdd : std::vector<int>  : omp_out += omp_in) \
+ initializer (omp_priv=omp_orig)
+
+//   Templates are  n o t  possible, i.e. the reduction has to be declared for a specific type.
+//template <class T>
+//#pragma omp declare reduction(VecAdd : std::vector<T>  : omp_out += omp_in) initializer (omp_priv(omp_orig))
+// MS: template nach #pragma  !?
+
+// ------------------------------------------------------------
+
+
+/** 	Test for vector reduction.
+ * 
+ * The thread-private vectors of size @p n are initialized via @f$v_k^{tID}=tID+k@f$.
+ * Afterwards these vectors are accumulated, i.e., 
+ * @f$v_k= \sum_{tID=0}^{numThreads-1} v_k^{tID}@f$.
+ * 
+ * 	@param[in] n  size of global/private vector
+ * 	@return  resulting global vector.
+*/
+std::vector<int> reduction_vec(int n);
+
+
+
+/** 	Writes all elements of a vector to a stream,
+ *  each one right-aligned in a field of width 4 and followed by two blanks.
+	@param[in,out] s	output stream
+	@param[in]     x	vector to print
+	@return 	   the output stream @p s
+*/
+template <class T>
+std::ostream &operator<<(std::ostream &s, std::vector<T> const &x)
+{
+    for (size_t k = 0; k < x.size(); ++k) {
+        s << std::setw(4) << x[k] << "  ";
+    }
+    return s;
+}
+
+// Declare a reduction that concatenates STL-vectors:
+//   the combiner appends all elements of omp_in to the end of omp_out,
+//   each private copy starts as an empty vector.
+#pragma omp declare reduction(VecAppend : std::vector<int> : omp_out.insert(omp_out.end(), omp_in.begin(), omp_in.end()) ) initializer( omp_priv = std::vector<int>() )
+ 
+/** 	Test for the appending vector reduction VecAppend.
+ *
+ * Each thread fills a local vector of size @p n via @f$v_k^{tID}=tID+k@f$;
+ * the reduction concatenates these vectors.
+ *
+ * 	@param[in] n  size of each thread-local vector
+ * 	@return  concatenation of all thread-local vectors.
+*/
+std::vector<int> reduction_vec_append(int n);
--- a/ex5/demo_skalar/demo_skalar/timing.h
+++ b/ex5/demo_skalar/demo_skalar/timing.h
@ -0,0 +1,70 @@
+#pragma once
+#include <chrono>                  // timing
+#include <stack>
+
+using Clock = std::chrono::system_clock;   //!< The wall clock timer chosen
+//using Clock = std::chrono::high_resolution_clock;
+using TPoint= std::chrono::time_point<Clock>;   //!< Time point of the chosen clock
+
+// [Galowicz, C++17 STL Cookbook, p. 29]
+// inline variable: one common stack for all translation units including this header
+inline std::stack<TPoint> MyStopWatch; //!< starting times of the (nested) stopwatches
+
+/** Starts stopwatch timer.
+ *  Use as @code tic(); myfunction(...) ; double tsec = toc();  @endcode
+ *
+ *  The timing is allowed to be nested and the recent time is stored on top of the stack.
+ *
+ *  @return recent time
+ *  @see toc
+ */
+inline auto tic()
+{
+    MyStopWatch.push(Clock::now());
+    return MyStopWatch.top();
+}
+
+/** Returns the elapsed time from stopwatch and removes the top entry of the stack.
+ *
+ * The time from top of the stack is used
+ * if time point @p t_b is not passed as input parameter;
+ * in that case tic() has to be called before, i.e., the stack must not be empty.
+ * Use as @code tic(); myfunction(...) ; double tsec = toc();  @endcode
+ * or as @code auto t_b = tic(); myfunction(...) ; double tsec = toc(t_b);  @endcode
+ * The last option is to be used in the case of
+ * non-nested but overlapping time measurements.
+ * An empty stack is left untouched if @p t_b is passed explicitly.
+ *
+ * @param[in]  t_b start time of some stop watch
+ * @return elapsed time in seconds.
+ * @see tic
+ *
+*/
+inline double toc(TPoint const &t_b = MyStopWatch.top())
+{
+    // https://en.cppreference.com/w/cpp/chrono/treat_as_floating_point
+    using Unit      = std::chrono::seconds;
+    using FpSeconds = std::chrono::duration<double, Unit::period>;
+    auto const t_e = Clock::now();
+    // Evaluate the difference before popping: by default t_b refers to the top
+    // element of the stack, which does not exist any longer after pop().
+    double const elapsed = FpSeconds(t_e - t_b).count();
+    if (!MyStopWatch.empty()) {
+        MyStopWatch.pop();
+    }
+    return elapsed;
+}
+
+#include <iostream>
+#include <string>
+/** Executes function @p f and measures/prints elapsed wall clock time in microseconds
+ *
+ * Call as
+ * @code measure("Time for (b = b + 1)", [&]() {
+        thrust::transform(b.begin(), b.end(), b.begin(), increment());
+    }); @endcode
+ *
+ * The result is printed to std::cout as "<label>: <duration> microseconds".
+ *
+ * @param[in]  label  additional string to be printed with the measurement.
+ * @param[in]  f      function to execute (called once, without arguments).
+ * @author Therese Bösmüller, 2025
+ *
+*/
+// inline: this header may be included by several translation units without
+// causing multiple definitions of the variable at link time.
+inline const auto measure = [](const std::string& label, auto&& f) {
+        auto start    = std::chrono::high_resolution_clock::now();
+        f();
+        auto stop     = std::chrono::high_resolution_clock::now();
+        auto duration = std::chrono::duration_cast<std::chrono::microseconds>(stop - start).count();
+        std::cout << label << ": " << duration << " microseconds" << std::endl;
+};          // ';' is needed for a visible documentation of this lambda-function
--- a/ex5/ex5_results.txt
+++ b/ex5/ex5_results.txt
--- a/ex5/ex5_sheet.pdf
+++ b/ex5/ex5_sheet.pdf