ex3

2025-11-11 13:36:28 +01:00 · 2025-11-11 13:36:28 +01:00 · 7a02dff345
commit 7a02dff345
parent f0cd37fddc
29 changed files with 3943 additions and 1 deletions
--- a/ex1/code/Makefile
+++ b/ex1/code/Makefile
@ -22,7 +22,7 @@ ${PROGRAM}:	${OBJECTS}
 	$(LINKER) ${OBJECTS} ${LINKFLAGS} -o ${PROGRAM}

 clean:
-	rm -f ${OBJECTS} ${PROGRAM}
+	rm -f ${OBJECTS} ${PROGRAM} out_1.txt


 run: ${PROGRAM}
--- a/ex3/code/Makefile
+++ b/ex3/code/Makefile
@ -0,0 +1,30 @@
+PROGRAM	= main
+
+SOURCES = $(wildcard *.cpp)
+HEADERS = $(wildcard *.h)
+OBJECTS = ${SOURCES:.cpp=.o}
+
+CXX     = g++
+LINKER  = g++
+
+WARNINGS = -Wall -pedantic -Wextra -Weffc++ -Woverloaded-virtual -Wfloat-equal -Wshadow \
+           -Wredundant-decls -fmax-errors=1
+
+# The sources use C++17 features (structured bindings, inline variables),
+# so the standard is requested explicitly instead of relying on the compiler default.
+CXXFLAGS   = -std=c++17 -g -flto -O3 -ffast-math -march=native ${WARNINGS}
+# LAPACKE calls into LAPACK/BLAS, so it is listed before the library providing them.
+LINKFLAGS  = -g -flto -O3 -llapacke -lopenblas
+
+# These targets do not name files; keep them working even if such files exist.
+.PHONY: all clean run
+
+all: ${PROGRAM}
+
+# Objects are compiled by make's built-in %.o: %.cpp rule (it uses CXX and CXXFLAGS).
+# Header dependencies are not tracked per file, so any header change rebuilds all objects.
+${OBJECTS}: ${HEADERS}
+
+${PROGRAM}:	${OBJECTS}
+	$(LINKER) ${OBJECTS} ${LINKFLAGS} -o ${PROGRAM}
+
+clean:
+	rm -f ${OBJECTS} ${PROGRAM}
+
+
+run: ${PROGRAM}
+	./${PROGRAM}
--- a/ex3/code/main.cpp
+++ b/ex3/code/main.cpp
@ -0,0 +1,267 @@
+
+#include "task_3.h"
+#include "task_4+6.h"
+#include "task_5.h"
+#include "task_7.h"
+#include "timing.h"
+
+#include <iomanip>
+#include <iostream>
+#include <cblas.h>               // cBLAS Library
+#include <lapacke.h>
+
+// Task 1: prints the task banner and a pointer to the measurements below.
+// The STREAM and FLOPS benchmark outputs are kept as a comment inside the function.
+void task_1() {
+    printf("\n\n-------------- Task 1 --------------\n\n"
+           "See comment in main.cpp");
+
+    // -------------------------------------------------------------
+    // STREAM version $Revision: 5.10 $
+    // -------------------------------------------------------------
+    // This system uses 8 bytes per array element.
+    // -------------------------------------------------------------
+    // Array size = 80000000 (elements), Offset = 0 (elements)
+    // Memory per array = 610.4 MiB (= 0.6 GiB).
+    // Total memory required = 1831.1 MiB (= 1.8 GiB).
+    // Each kernel will be executed 20 times.
+    //  The *best* time for each kernel (excluding the first iteration)
+    //  will be used to compute the reported bandwidth.
+    // -------------------------------------------------------------
+    // Your clock granularity/precision appears to be 1 microseconds.
+    // Each test below will take on the order of 116886 microseconds.
+    //    (= 116886 clock ticks)
+    // Increase the size of the arrays if this shows that
+    // you are not getting at least 20 clock ticks per test.
+    // -------------------------------------------------------------
+    // WARNING -- The above is only a rough guideline.
+    // For best results, please be sure you know the
+    // precision of your system timer.
+    // -------------------------------------------------------------
+    // Function    Best Rate MB/s  Avg time     Min time     Max time
+    // Copy:           29569.4     0.048585     0.043288     0.059164
+    // Scale:          17644.0     0.082248     0.072546     0.102548
+    // Add:            21030.1     0.100620     0.091298     0.124700
+    // Triad:          21230.7     0.100758     0.090435     0.120631
+    // -------------------------------------------------------------
+    // Solution Validates: avg error less than 1.000000e-13 on all three arrays
+    // -------------------------------------------------------------
+    // ./flops.exe
+
+    //    FLOPS C Program (Double Precision), V2.0 18 Dec 1992
+
+    //    Module     Error        RunTime      MFLOPS
+    //                             (usec)
+    //      1      4.0146e-13      0.0024   5827.9076
+    //      2     -1.4166e-13      0.0007  10037.8942
+    //      3      4.7184e-14      0.0039   4371.9185
+    //      4     -1.2557e-13      0.0034   4355.5711
+    //      5     -1.3800e-13      0.0066   4415.6439
+    //      6      3.2380e-13      0.0065   4441.6299
+    //      7     -8.4583e-11      0.0053   2277.1707
+    //      8      3.4867e-13      0.0069   4367.6094
+
+    //    Iterations      =  512000000
+    //    NullTime (usec) =     0.0000
+    //    MFLOPS(1)       =  7050.6178
+    //    MFLOPS(2)       =  3461.6233
+    //    MFLOPS(3)       =  4175.0442
+    //    MFLOPS(4)       =  4389.7311
+}
+
+// Task 2: prints the task banner; the pen-and-paper answers are kept as a comment
+// inside the function.
+void task_2() {
+    printf("\n\n-------------- Task 2 --------------\n\n"
+           "See comment in main.cpp");
+
+    // Memory needed (double 64-bit, 8 bytes):
+    // (A) (2N + 1) * 8 bytes
+    // (B) (M*N + M + N) * 8 bytes
+    // (C) (M*L + L*N + M*N) * 8 bytes
+    // (D) (N + N + p) * 8 bytes
+
+    // Floating point operations:
+    // (A) 2N
+    // (B) M * 2N
+    // (C) M * 2L * N
+    // (D) 2 * N * p (Horner scheme)
+
+    // Read/Write operations:
+    // (A) Read: 2N         Write: 1
+    // (B) Read: M*2N       Write: M*N
+    // (C) Read: M*2L*N     Write: M*L*N
+    // (D) Read: 2*N*p      Write: N*p
+}
+
+// Task 3: prints the task banner and where the implementation lives.
+void task_3() {
+    printf("\n\n-------------- Task 3 --------------\n\n"
+           "Functions implemented in task_3.cpp");
+}
+
+/**
+ * Runs the benchmarks (A)-(D) and prints duration, GFLOPS and bandwidth for each.
+ *
+ * Every benchmark initialises its data, times NLOOPS repetitions with tic()/toc()
+ * and hands the time per repetition to print_performance() together with the
+ * number of array elements touched and the floating point operations performed.
+ *
+ * @param[in] cblas  false: time the hand-written kernels, print the Task 4 banner
+ *                          and include the polynomial benchmark (D);
+ *                   true : time the cBLAS calls for (A)-(C) only (used by task_6).
+ */
+void task_4(bool cblas = false) {
+    if (!cblas) {printf("\n\n-------------- Task 4 --------------\n\n");}
+
+    { //          Scalar product
+    printf("----- Benchmark (A) -----\n");
+    // Initialization
+        size_t const N = 50'000'000;
+        size_t const NLOOPS = 50;
+        auto [x,y] = init_A(N);
+    // Benchmark
+        tic();
+        benchmark_A(x, y, NLOOPS, cblas);
+        double const sec = toc() / NLOOPS;
+    // Timings and Performance
+        size_t const memory = 2 * N;
+        size_t const flops  = 2 * N;
+        print_performance(sec, memory, flops, sizeof(x[0]));
+    printf("-------------------------\n");
+    }
+
+    { //      Matrix-Vector product
+    printf("----- Benchmark (B) -----\n");
+    // Initialization
+        size_t const M = 8'000;
+        size_t const N = 12'000;
+        size_t const NLOOPS = 30;
+        auto [A,x] = init_B(M,N);
+    // Benchmark
+        tic();
+        benchmark_B(A, x, NLOOPS, cblas);
+        double const sec = toc() / NLOOPS;
+    // Timings and Performance
+        size_t const memory = M*N + M + N;
+        size_t const flops  = 2 * M * N;
+        print_performance(sec, memory, flops, sizeof(A[0]));
+    printf("-------------------------\n");
+    }
+
+    { //     Matrix-Matrix product
+    printf("----- Benchmark (C) -----\n");
+    // Initialization
+        size_t const M = 1'000;
+        size_t const N = 2'000;
+        size_t const L = 500;
+        size_t const NLOOPS = 20;
+        auto [A,B] = init_C(M,N,L);
+    // Benchmark
+        tic();
+        benchmark_C(A, B, L, NLOOPS, cblas);
+        double const sec = toc() / NLOOPS;
+    // Timings and Performance
+        size_t const memory = M*L + L*N + M*N;
+        size_t const flops  = M * 2*L * N;
+        print_performance(sec, memory, flops, sizeof(A[0]));
+    printf("-------------------------\n");
+    }
+
+    if (!cblas)
+    { //      Polynomial evaluation (no cBLAS counterpart)
+    printf("----- Benchmark (D) -----\n");
+    // Initialization
+        size_t const N = 1'000'000;
+        size_t const p = 200;
+        size_t const NLOOPS = 20;
+        auto [x,a] = init_D(N,p);
+    // Benchmark
+        tic();
+        benchmark_D(x, a, NLOOPS);
+        double const sec = toc() / NLOOPS;
+    // Timings  and Performance
+        // x and y hold N entries each, the coefficient vector a holds p entries;
+        // everything is computed in integer arithmetic.
+        size_t const memory = 2 * N + p;
+        size_t const flops  = 2 * N * p;
+        print_performance(sec, memory, flops, sizeof(x[0]));
+    printf("-------------------------\n");
+    }
+}
+
+// Task 5: times the Euclidean norm of a vector with 50'000'000 entries and
+// prints the measured performance followed by the answer to the task question.
+void task_5() {
+    printf("\n\n-------------- Task 5 --------------\n\n");
+    printf("----- Benchmark norm -----\n");
+
+    // Set-up (not timed)
+    size_t const n_elems = 50'000'000;
+    size_t const n_loops = 50;
+    vector<double> x = init_norm(n_elems);
+
+    // Timed section: n_loops norm evaluations, reported per evaluation
+    tic();
+    benchmark_norm(x, n_loops);
+    double const sec_per_loop = toc() / n_loops;
+
+    // One vector is read; one multiplication and one addition per entry
+    size_t const elems_touched = n_elems;
+    size_t const flop_count    = 2 * n_elems;
+    print_performance(sec_per_loop, elems_touched, flop_count, sizeof(x[0]));
+
+    printf("-------------------------\n");
+    printf("What do you observe? Why?\n");
+    printf("-> Faster per loop than scalar product, only loads elements of 1 vector, instead of 2.");
+}
+
+// Task 6: repeats the benchmarks of task 4 with the cBLAS implementations.
+void task_6() {
+    printf("\n\n-------------- Task 6 --------------\n\n"
+           "Benchmarks using cBLAS\n");
+    bool const use_cblas = true;
+    task_4(use_cblas);
+}
+
+/**
+ * Task 7: LU solve with LAPACKE.
+ *
+ * Part 1 solves a 5x5 system with 2 right hand sides, prints the factorisation and
+ * the solution and multiplies the original matrix with the solution as a check.
+ * Part 2 prints a table of the total time (seconds) of NLOOPS calls to
+ * benchmark_lapacke() for N in {1,...,32} (columns) and Nrhs = 2,4,...,512 (rows).
+ */
+void task_7() {
+    printf("\n\n-------------- Task 7 --------------\n\n");
+    { // Check Ax=b
+    size_t N=5, Nrhs=2;
+    auto [A,b] = init_M(N,Nrhs);
+    vector<double> A_og = A;          // dgetrf overwrites A; keep a copy for the check
+
+    printf("A =");
+    print_matrix(A,N,N);
+    printf("b =");
+    print_matrix(b,N,Nrhs);
+
+    int lda=N, ldb=Nrhs;              // row-major leading dimensions
+    vector<int> ipiv(N);
+    LAPACKE_dgetrf(LAPACK_ROW_MAJOR, N, N, A.data(), lda, ipiv.data());
+    LAPACKE_dgetrs(LAPACK_ROW_MAJOR, 'N', N, Nrhs, A.data(), lda, ipiv.data(), b.data(), ldb);
+
+    printf("L + U =");
+    print_matrix(A,N,N);
+    printf("x =");
+    print_matrix(b,N,Nrhs);           // dgetrs stored the solution in b
+
+    int ldc=Nrhs;
+    vector<double> C(N*Nrhs);
+    cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, N, Nrhs, N, 1.0, A_og.data(), lda, b.data(), ldb, 0.0, C.data(), ldc);
+    printf("Check solution:\nA * x = ");
+    print_matrix(C,N,Nrhs);
+    }
+    // #################################
+    // Benchmark
+
+    cout << fixed << setprecision(4);  // 4 digits after decimal
+    size_t const NLOOPS = 1000;
+    cout << "N    = " << "  | 1      | 2      | 4      | 8      | 16     | 32 " << endl;
+    cout <<       "---------|--------|--------|--------|--------|--------|-------" << endl;
+    for (int exp = 1; exp < 10; ++exp) {
+        // Exact integer power of two; avoids the floating point pow() round trip
+        // and the dependency on <cmath>, which this file does not include.
+        int const n_rhs = 1 << exp;
+        cout << "Nrhs = " << n_rhs;
+        for (int const n : {1, 2, 4, 8, 16, 32}) {
+            tic();
+            for (size_t i = 0; i < NLOOPS; ++i) {
+                benchmark_lapacke(n, n_rhs);
+            }
+            double const sec = toc();
+            cout << " | " << sec;
+        }
+        cout << endl;
+    }
+    printf("\nFor fixed n, the solution time per rhs does not slow down consistently and scales very well.\nIts faster than expected.");
+}
+
+// Runs all tasks of the exercise sheet in order.
+int main() {
+    // The captureless lambda lets task_4 run with its default argument.
+    void (*const tasks[])() = {task_1, task_2, task_3, [] { task_4(); },
+                               task_5, task_6, task_7};
+    for (auto const run_task : tasks) {
+        run_task();
+    }
+    printf("\n\n");
+}
--- a/ex3/code/task_3.cpp
+++ b/ex3/code/task_3.cpp
@ -0,0 +1,61 @@
+#include "task_3.h"
+#include <vector>
+#include <cassert>
+#include <iostream>
+#include <cmath>
+using namespace std;
+
+
+// Scalar product: returns the sum of x[i]*y[i] over all entries.
+// Both vectors must have the same size (checked by assert).
+double scalar(vector<double> const &x, vector<double> const &y) {
+    assert(x.size() == y.size());
+    double acc = 0.0;
+    auto y_it = y.begin();
+    for (double const x_val : x) {
+        acc += x_val * (*y_it);
+        ++y_it;
+    }
+    return acc;
+}
+
+
+/**
+ * Matrix-vector product b = A*x.
+ *
+ * @param[in] A  row-major matrix with x.size() columns; its row count is
+ *               A.size() / x.size()
+ * @param[in] x  vector with one entry per matrix column
+ * @return vector with one entry per matrix row; empty if x is empty, because
+ *         the row count cannot be derived then
+ */
+vector<double> matrix_vec(vector<double> const &A, vector<double> const &x) {
+    size_t const N = x.size();
+    if (N == 0) {
+        return {};                       // avoids the division by zero below
+    }
+    assert(A.size() % N == 0);           // A must consist of complete rows
+    size_t const M = A.size() / N;
+    vector<double> b(M);
+
+    for (size_t i = 0; i < M; ++i) {
+        double row_sum = 0.0;            // accumulate locally, store once per row
+        for (size_t j = 0; j < N; ++j) {
+            row_sum += A[i*N + j] * x[j];
+        }
+        b[i] = row_sum;
+    }
+    return b;
+}
+
+
+/**
+ * Matrix-matrix product C = A*B for row-major matrices.
+ *
+ * @param[in] A  M x L matrix, L = A.size() / M
+ * @param[in] B  L x N matrix, N = B.size() / L
+ * @param[in] M  number of rows of A
+ * @return row-major M x N matrix; empty if M is zero or A is empty, because the
+ *         remaining dimensions cannot be derived then
+ */
+vector<double> matrix_matrix(vector<double> const &A, vector<double> const &B, size_t const &M) {
+    if (M == 0 || A.empty()) {
+        return {};                       // avoids the divisions by zero below
+    }
+    assert(A.size() % M == 0);           // A must consist of M complete rows
+    size_t const L = A.size() / M;
+    assert(B.size() % L == 0);           // B must consist of L complete rows
+    size_t const N = B.size() / L;
+    vector<double> C(M*N,0);
+
+    // i-k-j ordering: the innermost loop walks rows of B and C contiguously.
+    for (size_t i = 0; i < M; ++i) {
+        for (size_t k = 0; k < L; ++k) {
+            double const a_ik = A[i*L + k];
+            for (size_t j = 0; j < N; ++j) {
+                C[i*N + j] += a_ik * B[k*N + j];
+            }
+        }
+    }
+    return C;
+}
+
+
+/**
+ * Evaluates the polynomial a[0] + a[1]*t + ... + a[p-1]*t^(p-1) at every t = x[i]
+ * using the Horner scheme.
+ *
+ * @param[in] x  evaluation points
+ * @param[in] a  coefficients, a[j] belongs to t^j
+ * @return one polynomial value per entry of x; all zeros if a is empty
+ */
+vector<double> poly(vector<double> const &x, vector<double> const &a) {
+    size_t const N = x.size();
+    size_t const p = a.size();
+    vector<double> y(N);                 // zero-initialised: the result for p == 0
+    if (p == 0) {
+        return y;
+    }
+    for (size_t i = 0; i < N; ++i) {
+        double const t = x[i];
+        double value = a[p-1];           // highest coefficient is the last valid index
+        for (size_t k = p-1; k > 0; --k) {
+            value = value*t + a[k-1];    // runs down to and including a[0]
+        }
+        y[i] = value;
+    }
+    return y;
+}
--- a/ex3/code/task_3.h
+++ b/ex3/code/task_3.h
@ -0,0 +1,8 @@
+#pragma once
+#include <vector>
+using namespace std;
+
+// Returns the sum of x[i]*y[i]; both vectors are expected to have the same size.
+double scalar(vector<double> const &x, vector<double> const &y);
+// Returns A*x, where A is read as a row-major matrix with x.size() columns
+// and A.size()/x.size() rows.
+vector<double> matrix_vec(vector<double> const &A, vector<double> const &x);
+// Returns the row-major product A*B, where A has M rows and A.size()/M columns
+// and B has as many rows as A has columns.
+vector<double> matrix_matrix(vector<double> const &A, vector<double> const &B, size_t const &M);
+// Evaluates, for every entry of x, a polynomial whose coefficients are taken from a;
+// returns one value per entry of x.
+vector<double> poly(vector<double> const &x, vector<double> const &a);
--- a/ex3/code/task_4+6.cpp
+++ b/ex3/code/task_4+6.cpp
@ -0,0 +1,148 @@
+#include "task_3.h"
+#include "task_4+6.h"
+#include "timing.h"
+#include <vector>
+#include <iostream>
+#include <cblas.h>               // cBLAS Library
+using namespace std;
+
+/**
+ * Prints memory footprint, duration and achieved performance of one benchmark.
+ *
+ * @param[in] sec     duration of one benchmark repetition in seconds
+ * @param[in] memory  number of array elements touched per repetition
+ * @param[in] flops   number of floating point operations per repetition
+ * @param[in] size    size of one array element in bytes
+ */
+void print_performance(double sec, size_t memory, size_t flops, unsigned int size) {
+    double const GiB   = 1024.0 * 1024.0 * 1024.0;   // binary prefix, used for memory
+    double const Giga  = 1.0e9;                      // decimal prefix, used for FLOPS
+    double const bytes = static_cast<double>(memory) * size;
+
+    printf("Memory allocated  : %.3f GiByte\n", bytes / GiB);
+    printf("Duration per loop : %.3f sec\n", sec);
+    // GFLOPS means 10^9 operations per second, not 2^30.
+    printf("GFLOPS            : %.3f\n", static_cast<double>(flops) / sec / Giga);
+    printf("GiByte/s          : %.3f\n", bytes / sec / GiB);
+}
+
+// Test data for benchmark (A): x[i] = i%219 + 1 and y[i] = 1/x[i],
+// so every product x[i]*y[i] is (up to rounding) one.
+tuple<vector<double>, vector<double>> init_A(size_t N) {
+    vector<double> x(N), y(N);
+    for (size_t idx = 0; idx < N; ++idx) {
+        double const value = idx%219 + 1.0;
+        x[idx] = value;
+        y[idx] = 1.0 / value;
+    }
+    return make_tuple(x, y);
+}
+
+// Benchmark (A): evaluates the scalar product NLOOPS times, either with the
+// hand-written scalar() or with cblas_ddot(), and compares the accumulated
+// result against N*NLOOPS.
+void benchmark_A(vector<double> const &x, vector<double> const &y, size_t NLOOPS, bool cblas) {
+    size_t const n_elems = x.size();
+    double total = 0.0;
+
+    if (cblas) {
+        for (size_t rep = 0; rep < NLOOPS; ++rep) {
+            total += cblas_ddot(n_elems, x.data(), 1, y.data(), 1);
+        }
+    } else {
+        for (size_t rep = 0; rep < NLOOPS; ++rep) {
+            total += scalar(x, y);
+        }
+    }
+
+    // Check correctness
+    bool const as_expected = (static_cast<size_t>(total) == n_elems*NLOOPS);
+    if (!as_expected) {printf("  !!   W R O N G  result   !!\n");}
+}
+
+// Test data for benchmark (B): A[i,j] = (i+j)%219 + 1 (row-major, M x N) and
+// x[j] = 1/A[17,j]. Row 17 of A is read, so M has to be larger than 17.
+tuple<vector<double>, vector<double>> init_B(size_t M, size_t N) {
+    vector<double> A(M*N), x(N);
+    for (size_t row = 0; row < M; ++row) {
+        size_t const offset = row*N;
+        for (size_t col = 0; col < N; ++col) {
+            A[offset + col] = (row+col)%219 + 1.0;
+        }
+    }
+    size_t const ref_row = 17*N;    // start of the row used to build x
+    for (size_t col = 0; col < N; ++col) {
+        x[col] = 1.0/A[ref_row + col];
+    }
+    return make_tuple(A, x);
+}
+
+// Benchmark (B): evaluates the matrix-vector product NLOOPS times, either with the
+// hand-written matrix_vec() or with cblas_dgemv(), and compares the accumulated
+// entry 17 of the result against N*NLOOPS.
+void benchmark_B(vector<double> const &A, vector<double> const &x, size_t NLOOPS, bool cblas) {
+    size_t const n_cols = x.size();
+    size_t const n_rows = A.size() / n_cols;
+    vector<double> result(n_rows);
+    double total = 0.0;
+
+    if (cblas) {
+        for (size_t rep = 0; rep < NLOOPS; ++rep) {
+            cblas_dgemv(CblasRowMajor, CblasNoTrans, n_rows, n_cols, 1.0, A.data(), n_cols, x.data(), 1, 0, result.data(), 1);
+            total += result[17];
+        }
+    } else {
+        for (size_t rep = 0; rep < NLOOPS; ++rep) {
+            result = matrix_vec(A,x);
+            total += result[17];
+        }
+    }
+
+    // Check correctness
+    bool const as_expected = (static_cast<size_t>(total) == n_cols*NLOOPS);
+    if (!as_expected) {printf("  !!   W R O N G  result   !!\n");}
+}
+
+// Test data for benchmark (C): A is M x L and B is L x N (both row-major) with
+// entries (i+j)%219 + 1, except for column 17 of B.
+tuple<vector<double>, vector<double>> init_C(size_t M, size_t N, size_t L) {
+    vector<double> A(M*L), B(L*N);
+    for (size_t row = 0; row < M; ++row) {
+        for (size_t col = 0; col < L; ++col) {
+            A[row*L + col] = (row+col)%219 + 1.0;
+        }
+    }
+    // Column 17 of B holds the reciprocals of the first row of A,
+    // B[i,17] = 1/A[0,i], which makes C[0,17] = L.
+    for (size_t row = 0; row < L; ++row) {
+        for (size_t col = 0; col < N; ++col) {
+            B[row*N + col] = (col == 17) ? 1.0/A[row]
+                                         : (row+col)%219 + 1.0;
+        }
+    }
+    return make_tuple(A, B);
+}
+
+// Benchmark (C): evaluates the matrix-matrix product NLOOPS times, either with the
+// hand-written matrix_matrix() or with cblas_dgemm(), and compares the accumulated
+// entry 17 of the result against L*NLOOPS.
+void benchmark_C(vector<double> const &A, vector<double> const &B, size_t L, size_t NLOOPS, bool cblas) {
+    size_t const n_rows = A.size() / L;
+    size_t const n_cols = B.size() / L;
+    vector<double> product(n_rows*n_cols);
+    double total = 0.0;
+
+    if (cblas) {
+        for (size_t rep = 0; rep < NLOOPS; ++rep) {
+            cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, n_rows, n_cols, L, 1.0, A.data(), L, B.data(), n_cols, 0.0, product.data(), n_cols);
+            total += product[17];
+        }
+    } else {
+        for (size_t rep = 0; rep < NLOOPS; ++rep) {
+            product = matrix_matrix(A,B,n_rows);
+            total += product[17];
+        }
+    }
+
+    // Check correctness
+    bool const as_expected = (static_cast<size_t>(total) == L*NLOOPS);
+    if (!as_expected) {printf("  !!   W R O N G  result   !!\n");}
+}
+
+// Test data for benchmark (D):
+//   evaluation points x_i = i/N for i = 0,...,N-1
+//   coefficients      a_j = 1   for j = 0,...,p-1
+tuple<vector<double>, vector<double>> init_D(size_t N, size_t p) {
+    vector<double> x(N);
+    vector<double> a(p, 1.0);
+    for (size_t idx = 0; idx < N; ++idx) {
+        x[idx] = static_cast<double>(idx) / N;
+    }
+    return make_tuple(x, a);
+}
+
+// Benchmark (D): evaluates the polynomial NLOOPS times with poly() and compares
+// the accumulated first result entry against NLOOPS.
+void benchmark_D(vector<double> const &x, vector<double> const &a, size_t NLOOPS) {
+    vector<double> values(x.size());
+    double total = 0.0;
+
+    for (size_t rep = 0; rep < NLOOPS; ++rep) {
+        values = poly(x,a);
+        total += values[0];
+    }
+
+    // Check correctness
+    bool const as_expected = (static_cast<size_t>(total) == NLOOPS);
+    if (!as_expected) {printf("  !!   W R O N G  result sum = %f  !!\n", total);}
+}
--- a/ex3/code/task_4+6.h
+++ b/ex3/code/task_4+6.h
@ -0,0 +1,14 @@
+#pragma once
+#include <cstddef>
+#include <tuple>      // the init_* functions return std::tuple
+#include <vector>
+using namespace std;
+
+// Prints memory footprint, duration per loop, GFLOPS and bandwidth of one benchmark.
+void print_performance(double sec, size_t memory, size_t flops, unsigned int size);
+
+// Data set-up for the benchmarks (A) scalar product, (B) matrix-vector product,
+// (C) matrix-matrix product and (D) polynomial evaluation.
+tuple<vector<double>, vector<double>> init_A(size_t N);
+tuple<vector<double>, vector<double>> init_B(size_t M, size_t N);
+tuple<vector<double>, vector<double>> init_C(size_t M, size_t N, size_t L);
+tuple<vector<double>, vector<double>> init_D(size_t N, size_t p);
+
+// Run the respective kernel NLOOPS times and check the accumulated result.
+// cblas selects the cBLAS routine instead of the hand-written kernel.
+void benchmark_A(vector<double> const &x, vector<double> const &y, size_t NLOOPS, bool cblas);
+void benchmark_B(vector<double> const &A, vector<double> const &x, size_t NLOOPS, bool cblas);
+void benchmark_C(vector<double> const &A, vector<double> const &B, size_t L, size_t NLOOPS, bool cblas);
+void benchmark_D(vector<double> const &x, vector<double> const &a, size_t NLOOPS);
--- a/ex3/code/task_5.cpp
+++ b/ex3/code/task_5.cpp
@ -0,0 +1,33 @@
+#include "task_4+6.h"
+#include "task_5.h"
+#include "timing.h"
+#include <vector>
+#include <iostream>
+#include <cmath>
+using namespace std;
+
+// Euclidean norm: square root of the sum of the squared entries of x.
+double norm(vector<double> const &x) {
+    double sum_of_squares = 0.0;
+    for (double const entry : x) {
+        sum_of_squares += entry * entry;
+    }
+    return sqrt(sum_of_squares);
+}
+
+// Test data for the norm benchmark: x[i] = i%219 + 1.
+vector<double> init_norm(size_t N) {
+    vector<double> x(N);
+    size_t idx = 0;
+    for (double &entry : x) {
+        entry = idx%219 + 1.0;
+        ++idx;
+    }
+    return x;
+}
+
+// Evaluates norm(x) NLOOPS times and prints the mean of the results.
+void benchmark_norm(vector<double> const &x, size_t NLOOPS) {
+    double total = 0.0;
+    for (size_t rep = 0; rep < NLOOPS; ++rep) {
+        total += norm(x);
+    }
+    double const mean_norm = total/NLOOPS;
+    printf("||x|| = %f\n", mean_norm);
+}
--- a/ex3/code/task_5.h
+++ b/ex3/code/task_5.h
@ -0,0 +1,8 @@
+#pragma once
+
+#include <vector>
+using namespace std;
+
+// Returns the square root of the sum of the squared entries of x.
+double norm(vector<double> const &x);
+// Returns a vector of length N with entries x[i] = i%219 + 1.
+vector<double> init_norm(size_t N);
+// Calls norm(x) NLOOPS times and prints the mean of the results.
+void benchmark_norm(vector<double> const &x, size_t NLOOPS);
--- a/ex3/code/task_7.cpp
+++ b/ex3/code/task_7.cpp
@ -0,0 +1,43 @@
+#include "task_7.h"
+
+#include <iostream>
+#include <vector>
+#include <cmath>
+#include <lapacke.h>
+using namespace std;
+
+// Builds the linear systems for task 7 (both arrays row-major):
+//   A is N x N with 4 on the diagonal and 1/|i-j|^2 elsewhere,
+//   b is N x Nrhs with b[i,j] = j.
+tuple<vector<double>, vector<double>> init_M(size_t N, size_t Nrhs) {
+    vector<double> A(N*N), b(N*Nrhs);
+    for (size_t row = 0; row < N; ++row) {
+        for (size_t col = 0; col < N; ++col) {
+            A[row*N + col] = (row == col) ? 4
+                                          : 1.0 / pow(abs(1.0*row-1.0*col),2);
+        }
+        for (size_t rhs = 0; rhs < Nrhs; ++rhs) {
+            b[row*Nrhs + rhs] = 1.0*rhs;
+        }
+    }
+    return make_tuple(A, b);
+}
+
+// Prints the row-major M x N matrix A, one matrix row per line, framed by blank lines.
+void print_matrix(vector<double> &A, size_t M, size_t N) {
+    printf("\n");
+    for (size_t row = 0; row < M; ++row){
+        size_t const row_start = row*N;
+        for (size_t col = 0; col < N; ++col) {
+            printf("%f ", A[row_start + col]);
+        }
+        printf("\n");
+    }
+    printf("\n\n");
+}
+
+// One benchmark run: builds the N x N system with Nrhs right hand sides via init_M(),
+// factorises it with LAPACKE_dgetrf and solves it with LAPACKE_dgetrs.
+// The set-up is part of the call and therefore of any timing around it.
+void benchmark_lapacke(int N, int Nrhs) {
+    auto [A,b] = init_M(N,Nrhs);
+    vector<int> pivots(N);
+    int const lda = N;        // row-major leading dimension of A
+    int const ldb = Nrhs;     // row-major leading dimension of b
+    LAPACKE_dgetrf(LAPACK_ROW_MAJOR, N, N, A.data(), lda, pivots.data());
+    LAPACKE_dgetrs(LAPACK_ROW_MAJOR, 'N', N, Nrhs, A.data(), lda, pivots.data(), b.data(), ldb);
+}
--- a/ex3/code/task_7.h
+++ b/ex3/code/task_7.h
@ -0,0 +1,7 @@
+#pragma once
+#include <cstddef>
+#include <tuple>      // init_M returns std::tuple
+#include <vector>
+using namespace std;
+
+// Builds the row-major N x N matrix A and the N x Nrhs right hand side b of task 7.
+tuple<vector<double>, vector<double>> init_M(size_t N, size_t Nrhs);
+// Prints the row-major M x N matrix A, one matrix row per line.
+void print_matrix(vector<double> &A, size_t M, size_t N);
+// Sets up and solves one N x N system with Nrhs right hand sides using LAPACKE.
+void benchmark_lapacke(int N, int Nrhs);
--- a/ex3/code/timing.h
+++ b/ex3/code/timing.h
@ -0,0 +1,51 @@
+//
+//    Gundolf Haase, Oct 18 2024
+//
+#pragma once
+#include <chrono>                  // timing
+#include <stack>
+
+//using Clock = std::chrono::system_clock;   //!< The wall clock timer chosen
+using Clock = std::chrono::high_resolution_clock;
+using TPoint= std::chrono::time_point<Clock>;
+
+// [Galowicz, C++17 STL Cookbook, p. 29]
+inline 
+std::stack<TPoint> MyStopWatch; //!< starting time of stopwatch
+
+/** Starts the stopwatch.
+ *  Use as @code tic(); myfunction(...) ; double tsec = toc();  @endcode
+ *
+ *  Timings can be nested: every call pushes the current time point onto the
+ *  stopwatch stack, so the most recent start is always on top.
+ *
+ *  @return the time point that was pushed
+ *  @see toc
+ */
+inline auto tic()
+{
+    TPoint const start = Clock::now();
+    MyStopWatch.push(start);
+    return start;
+}
+ 
+/** Returns the elapsed time from the stopwatch.
+ *
+ * The time point on top of the stack is used
+ * if time point @p t_b is not passed as input parameter.
+ * Use as @code tic(); myfunction(...) ; double tsec = toc();  @endcode
+ * or as @code auto t_b = tic(); myfunction(...) ; double tsec = toc(t_b);  @endcode
+ * The last option is to be used in the case of
+ * non-nested but overlapping time measurements.
+ *
+ * Every call removes the top entry of the stopwatch stack (if there is one).
+ * Calling toc() without argument requires a preceding tic(), because the default
+ * argument reads the top of the stack.
+ *
+ * @param[in]  t_b start time of some stop watch
+ * @return elapsed time in seconds.
+ *
+*/
+inline double toc(TPoint const &t_b = MyStopWatch.top())
+{
+    // https://en.cppreference.com/w/cpp/chrono/treat_as_floating_point
+    using Unit      = std::chrono::seconds;
+    using FpSeconds = std::chrono::duration<double, Unit::period>;
+    auto const t_e = Clock::now();
+    // By default t_b refers to the top element of the stack, which pop() destroys:
+    // the elapsed time has to be computed before the element is removed.
+    double const elapsed = FpSeconds(t_e-t_b).count();
+    if (!MyStopWatch.empty()) {
+        MyStopWatch.pop();
+    }
+    return elapsed;
+}
--- a/ex3/ex_3.pdf
+++ b/ex3/ex_3.pdf
--- a/ex3/stream/CLANG_default.mk
+++ b/ex3/stream/CLANG_default.mk
@ -0,0 +1,124 @@
+# Basic definitions for using the Clang/LLVM compiler suite sequentially
+# requires setting of COMPILER=CLANG_
+
+#CLANGPATH=//usr/lib/llvm-10/bin/
+CC     = ${CLANGPATH}clang
+CXX    = ${CLANGPATH}clang++
+#CXX   = ${CLANGPATH}clang++ -lomptarget  -fopenmp-targets=nvptx64-nvidia-cuda --cuda-path=/opt/pgi/linux86-64/2017/cuda/8.0
+#F77   = gfortran
+LINKER = ${CXX}
+
+#http://clang.llvm.org/docs/UsersManual.html#options-to-control-error-and-warning-messages
+WARNINGS += -Weverything 
+WARNINGS += -Wno-c++98-compat -Wno-c++98-compat-pedantic -Wno-sign-conversion -Wno-date-time -Wno-shorten-64-to-32 -Wno-padded 
+WARNINGS += -Wdocumentation -Wconversion -Wshadow -Wfloat-conversion -pedantic -ferror-limit=1
+#-fsyntax-only -Wdocumentation -Wconversion -Wshadow -Wfloat-conversion -pedantic
+
+CXXFLAGS += -O3 -std=c++17 -ferror-limit=1 ${WARNINGS}
+# don't use -Ofast
+# -ftrapv
+LINKFLAGS += -O3
+
+# different libraries on Ubuntu or Manjaro
+ifndef UBUNTU
+UBUNTU=1
+endif
+
+# BLAS, LAPACK
+LINKFLAGS += -llapack -lblas
+# -lopenblas
+ifeq ($(UBUNTU),1)
+# ubuntu
+else
+# on  archlinux
+LINKFLAGS += -lcblas
+endif
+
+# interprocedural optimization
+CXXFLAGS  += -flto
+LINKFLAGS += -flto
+
+#   very good check
+# http://clang.llvm.org/extra/clang-tidy/
+#   good check, see:  http://llvm.org/docs/CodingStandards.html#include-style
+SWITCH_OFF=,-readability-magic-numbers,-readability-redundant-control-flow,-readability-redundant-member-init
+SWITCH_OFF+=,-readability-redundant-member-init,-readability-isolate-declaration
+#READABILITY=,readability*${SWITCH_OFF}
+#TIDYFLAGS = -checks=llvm-*,-llvm-header-guard -header-filter=.* -enable-check-profile -extra-arg="-std=c++17" -extra-arg="-fopenmp"
+TIDYFLAGS = -checks=llvm-*,-llvm-header-guard${READABILITY} -header-filter=.* -enable-check-profile -extra-arg="-std=c++17" -extra-arg="-fopenmp"
+#TIDYFLAGS += -checks='modernize*
+#   ???
+#TIDYFLAGS = -checks='cert*'  -header-filter=.*
+#   MPI checks ??
+#TIDYFLAGS = -checks='mpi*'
+#   ??
+#TIDYFLAGS = -checks='performance*'   -header-filter=.*
+#TIDYFLAGS = -checks='portability-*'  -header-filter=.*
+#TIDYFLAGS = -checks='readability-*'  -header-filter=.*
+
+default: ${PROGRAM}
+
+${PROGRAM}:	${OBJECTS}
+	$(LINKER)  $^  ${LINKFLAGS} -o $@
+
+clean:
+	@rm -f ${PROGRAM} ${OBJECTS}
+
+clean_all:: clean
+	@rm -f *_ *~ *.bak *.log *.out *.tar
+
+codecheck: tidy_check
+tidy_check:
+	clang-tidy ${SOURCES} ${TIDYFLAGS} -- ${SOURCES}
+# see also http://clang-developers.42468.n3.nabble.com/Error-while-trying-to-load-a-compilation-database-td4049722.html
+
+run: clean ${PROGRAM}
+#	time  ./${PROGRAM} ${PARAMS}
+	./${PROGRAM} ${PARAMS}
+
+# tar the current directory
+MY_DIR = `basename ${PWD}`
+tar: clean_all
+	@echo "Tar the directory: " ${MY_DIR}
+	@cd .. ;\
+	tar cf ${MY_DIR}.tar ${MY_DIR} *default.mk ;\
+	cd ${MY_DIR}
+# 	tar cf `basename ${PWD}`.tar *
+
+doc:
+	doxygen Doxyfile
+
+#########################################################################
+
+.cpp.o:
+	$(CXX) -c $(CXXFLAGS) -o $@ $<
+
+.c.o:
+	$(CC) -c $(CFLAGS) -o $@ $<
+
+.f.o:
+	$(F77) -c $(FFLAGS) -o $@ $<
+
+##################################################################################################
+#    some tools
+# Cache behaviour (CXXFLAGS += -g  tracks down to source lines; no -pg in linkflags)
+cache: ${PROGRAM}
+	valgrind --tool=callgrind --simulate-cache=yes ./$^  ${PARAMS}
+#	kcachegrind callgrind.out.<pid> &
+	kcachegrind `ls -1tr  callgrind.out.* |tail -1`
+
+# Check for wrong memory accesses, memory leaks, ...
+# use smaller data sets
+mem: ${PROGRAM}
+	valgrind -v --leak-check=yes --tool=memcheck --undef-value-errors=yes --track-origins=yes --log-file=$^.addr.out --show-reachable=yes ./$^  ${PARAMS}
+
+#  Simple run time profiling of your code
+#  CXXFLAGS += -g -pg
+#  LINKFLAGS += -pg
+prof: ${PROGRAM}
+	perf record ./$^  ${PARAMS}
+	perf report
+#	gprof -b ./$^ > gp.out
+#	kprof -f gp.out -p gprof &
+
+codecheck: tidy_check
--- a/ex3/stream/GCC_default.mk
+++ b/ex3/stream/GCC_default.mk
@ -0,0 +1,183 @@
+# Basic definitions for using the GNU compiler suite sequentially
+# requires setting of COMPILER=GCC_
+
+CC	= gcc
+CXX     = g++
+F77	= gfortran
+LINKER  = ${CXX}
+
+#LINKFLAGS   += -lblas
+# The <cblas.h> header requires extern "C".
+
+WARNINGS = -Wall -pedantic -Wextra -Weffc++ -Woverloaded-virtual -Wfloat-equal -Wshadow \
+           -Wredundant-decls -Winline -fmax-errors=1
+#  -Wunreachable-code
+#CXXFLAGS += -ffast-math -march=native ${WARNINGS}
+CXXFLAGS += -O3 -funroll-all-loops -std=c++17 ${WARNINGS}
+#-msse3
+# -ftree-vectorizer-verbose=2  -DNDEBUG
+# -ftree-vectorizer-verbose=5
+# -ftree-vectorize -fdump-tree-vect-blocks=foo.dump  -fdump-tree-pre=stderr
+
+# CFLAGS	= -ffast-math -O3 -DNDEBUG -msse3 -fopenmp -fdump-tree-vect-details
+# CFLAGS	= -ffast-math -O3 -funroll-loops -DNDEBUG -msse3 -fopenmp -ftree-vectorizer-verbose=2
+# #CFLAGS	= -ffast-math -O3 -DNDEBUG -msse3 -fopenmp
+# FFLAGS	= -ffast-math -O3 -DNDEBUG -msse3 -fopenmp
+# LFLAGS  = -ffast-math -O3 -DNDEBUG -msse3 -fopenmp
+LINKFLAGS   += -O3
+
+#architecture
+#CPU = -march=znver2
+CXXFLAGS  += ${CPU}
+LINKFLAGS += ${CPU}
+
+# different libraries on Ubuntu or Manjaro
+ifndef UBUNTU
+UBUNTU=1
+endif
+
+# BLAS, LAPACK
+ifeq ($(UBUNTU),1)
+LINKFLAGS += -llapack -lblas
+# -lopenblas
+else
+# on  archlinux
+LINKFLAGS += -llapack -lopenblas -lcblas
+endif
+
+# interprocedural optimization
+CXXFLAGS  += -flto
+LINKFLAGS += -flto
+
+# for debugging purpose (save code)
+# -fsanitize=leak         # only one out the three can be used
+# -fsanitize=address
+# -fsanitize=thread
+SANITARY =  -fsanitize=address  -fsanitize=undefined -fsanitize=null -fsanitize=return \
+ -fsanitize=bounds -fsanitize=alignment -fsanitize=float-divide-by-zero -fsanitize=float-cast-overflow \
+ -fsanitize=bool -fsanitize=enum -fsanitize=vptr
+#CXXFLAGS  += ${SANITARY}
+#LINKFLAGS += ${SANITARY}
+
+# profiling tools
+#CXXFLAGS  += -pg
+#LINKFLAGS += -pg
+
+default: ${PROGRAM}
+
+${PROGRAM}:	${OBJECTS}
+	$(LINKER)  $^  ${LINKFLAGS} -o $@
+
+clean:
+	@rm -f ${PROGRAM} ${OBJECTS}
+
+clean_all:: clean
+	-@rm -f *_ *~ *.bak *.log *.out *.tar *.orig *.optrpt
+	-@rm -rf html
+
+run: clean ${PROGRAM}
+#run: ${PROGRAM}
+#	time  ./${PROGRAM} ${PARAMS}
+	./${PROGRAM} ${PARAMS}
+
+# tar the current directory
+MY_DIR = `basename ${PWD}`
+tar: clean_all
+	@echo "Tar the directory: " ${MY_DIR}
+	@cd .. ;\
+	tar cf ${MY_DIR}.tar ${MY_DIR} *default.mk ;\
+	cd ${MY_DIR}
+# 	tar cf `basename ${PWD}`.tar *
+
+zip: clean
+	@echo "Zip the directory: " ${MY_DIR}
+	@cd .. ;\
+	zip -r ${MY_DIR}.zip ${MY_DIR} *default.mk ;\
+	cd ${MY_DIR}
+
+doc:
+	doxygen Doxyfile
+
+#########################################################################
+.SUFFIXES: .f90
+
+.cpp.o:
+	$(CXX) -c $(CXXFLAGS) -o $@ $<
+#	$(CXX) -c $(CXXFLAGS) -o $@ $<  2>&1 | tee -a $<.log 
+#	$(CXX) -c $(CXXFLAGS) -o $@ $<  2>&1 | tee -a $(<:.cpp=.log)
+
+.c.o:
+	$(CC) -c $(CFLAGS) -o $@ $<
+
+.f.o:
+	$(F77) -c $(FFLAGS) -o $@ $<
+
+.f90.o:
+	$(F77) -c $(FFLAGS) -o $@ $<
+
+##################################################################################################
+#    some tools
+# Cache behaviour (CXXFLAGS += -g  tracks down to source lines; no -pg in linkflags)
+cache: ${PROGRAM}
+	valgrind --tool=callgrind --simulate-cache=yes ./$^  ${PARAMS}
+#	kcachegrind callgrind.out.<pid> &
+	kcachegrind `ls -1tr  callgrind.out.* |tail -1`
+
+# Check for wrong memory accesses, memory leaks, ...
+# use smaller data sets
+# no "-pg"  in compile/link options
+mem: ${PROGRAM}
+	valgrind -v --leak-check=yes --tool=memcheck --undef-value-errors=yes --track-origins=yes --log-file=$^.addr.out --show-reachable=yes ./$^  ${PARAMS}
+# Graphical interface
+# valkyrie
+
+#  Simple run time profiling of your code
+#  CXXFLAGS += -g -pg
+#  LINKFLAGS += -pg
+prof: ${PROGRAM}
+	perf record ./$^  ${PARAMS}
+	perf report
+#	gprof -b ./$^ > gp.out
+#	kprof -f gp.out -p gprof &
+
+#  perf in Ubuntu 20.04:   https://www.howtoforge.com/how-to-install-perf-performance-analysis-tool-on-ubuntu-20-04/
+#  * install 
+#  * sudo vi /etc/sysctl.conf
+#                add   kernel.perf_event_paranoid = 0
+
+#Trace your heap:
+#> heaptrack ./main.GCC_
+#> heaptrack_gui heaptrack.main.GCC_.<pid>.gz
+heap: ${PROGRAM}
+	heaptrack ./$^  ${PARAMS} 11
+	heaptrack_gui  `ls -1tr  heaptrack.$^.* |tail -1` &
+
+codecheck: $(SOURCES)
+	cppcheck --enable=all --inconclusive --std=c++17 --suppress=missingIncludeSystem $^
+
+
+########################################################################
+#  get the detailed  status of all optimization flags
+info:
+	echo "detailed  status of all optimization flags"
+	$(CXX) --version
+	$(CXX) -Q $(CXXFLAGS) --help=optimizers
+	lscpu
+	inxi -C
+	lstopo
+
+# Excellent hardware info
+#	hardinfo
+# Life monitoring of CPU frequency etc.
+#	sudo i7z
+
+# Memory  consumption
+#	vmstat -at -SM 3
+#	xfce4-taskmanager
+
+
+# https://www.tecmint.com/check-linux-cpu-information/
+#https://www.tecmint.com/monitor-cpu-and-gpu-temperature-in-ubuntu/
+
+# Debugging:
+# https://wiki.archlinux.org/index.php/Debugging
--- a/ex3/stream/ICC_default.mk
+++ b/ex3/stream/ICC_default.mk
@ -0,0 +1,125 @@
+# Basic Definitions for using INTEL compiler suite sequentially
+# requires setting of COMPILER=ICC_
+
+#BINDIR = /opt/intel/bin/
+
+CC	= ${BINDIR}icc
+CXX     = ${BINDIR}icpc
+F77	= ${BINDIR}ifort
+LINKER  = ${CXX}
+
+
+WARNINGS = -Wall -Weffc++ -Woverloaded-virtual -Wfloat-equal -Wshadow -wd2015,2012 -wn3
+#    -Winline -Wredundant-decls -Wunreachable-code
+CXXFLAGS +=  -O3 -fargument-noalias -std=c++17 -DNDEBUG ${WARNINGS} -mkl
+# profiling tools
+#CXXFLAGS  += -pg
+#LINKFLAGS += -pg
+# -vec-report=3
+# -qopt-report=5 -qopt-report-phase=vec
+# -guide -parallel
+# -guide-opts=string  -guide-par[=n]  -guide-vec[=n]
+# -auto-p32 -simd -msse3
+
+LINKFLAGS += -O3
+
+# LAPACK, BLAS: use MKL by INTEL
+# LINKFLAGS += -L${BINDIR}../composer_xe_2013.1.117/mkl/lib/intel64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lpthread
+CXXFLAGS  += -mkl
+LINKFLAGS += -mkl
+
+# interprocedural optimization
+#CXXFLAGS  += -ipo
+#LINKFLAGS += -ipo
+
+# annotated assembler file
+ANNOTED = -fsource-asm -S
+
+default:	${PROGRAM}
+
+${PROGRAM}:	${OBJECTS}
+	$(LINKER)  $^  ${LINKFLAGS} -o $@
+
+clean:
+	rm -f ${PROGRAM} ${OBJECTS}
+
+clean_all:: clean
+	@rm -f *_ *~ *.bak *.log *.out *.tar
+
+run: clean ${PROGRAM}
+	./${PROGRAM}
+
+# tar the current directory
+MY_DIR = `basename ${PWD}`
+tar: clean_all
+	@echo "Tar the directory: " ${MY_DIR}
+	@cd .. ;\
+	tar cf ${MY_DIR}.tar ${MY_DIR} *default.mk ;\
+	cd ${MY_DIR}
+# 	tar cf `basename ${PWD}`.tar *
+
+doc:
+	doxygen Doxyfile
+
+#########################################################################
+
+.cpp.o:
+	$(CXX) -c $(CXXFLAGS) -o $@ $<
+
+.c.o:
+	$(CC) -c $(CFLAGS) -o $@ $<
+
+.f.o:
+	$(F77) -c $(FFLAGS) -o $@ $<
+
+##################################################################################################
+# #    some tools
+# # Cache behaviour (CXXFLAGS += -g  tracks down to source lines)
+# cache: ${PROGRAM}
+# 	valgrind --tool=callgrind --simulate-cache=yes ./$^
+# #	kcachegrind callgrind.out.<pid> &
+#
+# # Check for wrong memory accesses, memory leaks, ...
+# # use smaller data sets
+# mem: ${PROGRAM}
+# 	valgrind -v --leak-check=yes --tool=memcheck --undef-value-errors=yes --track-origins=yes --log-file=$^.addr.out --show-reachable=yes ./$^
+#
+# #  Simple run time profiling of your code
+# #  CXXFLAGS += -g -pg
+# #  LINKFLAGS += -pg
+# prof: ${PROGRAM}
+# 	./$^
+# 	gprof -b ./$^ > gp.out
+# #	kprof -f gp.out -p gprof &
+#
+
+
+mem: inspector
+prof: amplifier
+cache: amplifier
+
+gap_par_report:
+	${CXX}  -c -guide -parallel $(SOURCES) 2> gap.txt
+
+# GUI for performance report
+amplifier: ${PROGRAM}
+	echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope
+#	alternatively to the solution above:
+            #edit file  /etc/sysctl.d/10-ptrace.conf     and set the variable   kernel.yama.ptrace_scope   to 0 .
+	vtune-gui &
+
+# GUI for Memory and Thread analyzer (race condition)
+inspector: ${PROGRAM}
+	echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope
+	inspxe-gui &
+
+advisor:
+	echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope
+	vtune-gui &
+
+icc-info:
+	icpc -# main.cpp
+
+
+
+
--- a/ex3/stream/ONEAPI_default.mk
+++ b/ex3/stream/ONEAPI_default.mk
@ -0,0 +1,176 @@
+# Basic Definitions for using INTEL compiler suite sequentially
+# requires setting of COMPILER=ONEAPI_
+
+#         https://software.intel.com/content/www/us/en/develop/tools/oneapi/components/onemkl/link-line-advisor.html
+# requires
+# source /opt/intel/oneapi/setvars.sh
+# on  AMD:    export MKL_DEBUG_CPU_TYPE=5
+
+#BINDIR = /opt/intel/oneapi/compiler/latest/linux/bin/
+#MKL_ROOT = /opt/intel/oneapi/mkl/latest/
+#export KMP_AFFINITY=verbose,compact
+
+CC	= ${BINDIR}icc
+CXX     = ${BINDIR}dpcpp
+F77	= ${BINDIR}ifort
+LINKER  = ${CXX}
+
+## Compiler flags
+WARNINGS = -Wall -Weffc++ -Woverloaded-virtual -Wfloat-equal -Wshadow -pedantic
+WARNINGS += -Wpessimizing-move -Wredundant-move
+#-wd2015,2012,2014 -wn3
+#    -Winline -Wredundant-decls -Wunreachable-code
+# -qopt-subscript-in-range
+# -vec-threshold0
+
+CXXFLAGS += -O3 -std=c++17  ${WARNINGS}
+#CXXFLAGS += -DMKL_ILP64  -I"${MKLROOT}/include"
+#CXXFLAGS += -DMKL_ILP32  -I"${MKLROOT}/include"
+LINKFLAGS += -O3
+
+# interprocedural optimization
+CXXFLAGS  += -ipo
+LINKFLAGS += -ipo
+LINKFLAGS += -flto
+
+# annotated Assembler file
+ANNOTED = -fsource-asm -S 
+
+#architecture
+CPU  = -march=core-avx2
+#CPU += -mtp=zen
+# -xCORE-AVX2
+# -axcode COMMON-AVX512 -axcode MIC-AVX512 -axcode CORE-AVX512 -axcode CORE-AVX2
+CXXFLAGS  += ${CPU}
+LINKFLAGS += ${CPU}
+
+# use MKL by INTEL
+# https://software.intel.com/content/www/us/en/develop/tools/oneapi/components/onemkl/link-line-advisor.html
+# sequential MKL
+#                              use the 32 bit interface (LP64) instead of 64 bit interface (ILP64)
+CXXFLAGS +=  -qmkl=sequential  -UMKL_ILP64
+LINKFLAGS += -O3 -qmkl=sequential -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lpthread
+#LINKFLAGS += -O3 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lpthread
+
+# shared libs:  https://aur.archlinux.org/packages/intel-oneapi-compiler-static
+#     install intel-oneapi-compiler-static   
+# or 
+LINKFLAGS += -shared-intel
+
+
+OPENMP = -qopenmp
+CXXFLAGS += ${OPENMP}
+LINKFLAGS += ${OPENMP}
+
+
+# profiling tools
+#CXXFLAGS  += -pg
+#LINKFLAGS += -pg
+# -vec-report=3
+# -qopt-report=5 -qopt-report-phase=vec -qopt-report-phase=openmp
+# -guide -parallel
+# -guide-opts=string  -guide-par[=n]  -guide-vec[=n]
+# -auto-p32 -simd
+
+# Reports: https://software.intel.com/en-us/articles/getting-the-most-out-of-your-intel-compiler-with-the-new-optimization-reports
+#CXXFLAGS +=  -qopt-report=5 -qopt-report-phase=vec,par
+#CXXFLAGS +=  -qopt-report=5 -qopt-report-phase=cg
+# Redirect report from *.optrpt to stderr
+#    -qopt-report-file=stderr
+# Guided parallelization
+#    -guide -parallel
+#    -guide-opts=string  -guide-par[=n]  -guide-vec[=n]
+#    -auto-p32 -simd
+
+## run time checks
+# https://www.intel.com/content/www/us/en/develop/documentation/fortran-compiler-oneapi-dev-guide-and-reference/top/compiler-reference/compiler-options/offload-openmp-and-parallel-processing-options/par-runtime-control-qpar-runtime-control.html
+
+
+default:	${PROGRAM}
+
+${PROGRAM}:	${OBJECTS}
+	$(LINKER)  $^  ${LINKFLAGS} -o $@
+
+clean:
+	rm -f ${PROGRAM} ${OBJECTS} *.optrpt
+
+clean_all:: clean
+	@rm -f *_ *~ *.bak *.log *.out *.tar
+
+run: clean ${PROGRAM}
+	./${PROGRAM}
+
+# tar the current directory
+MY_DIR = `basename ${PWD}`
+tar: clean_all
+	@echo "Tar the directory: " ${MY_DIR}
+	@cd .. ;\
+	tar cf ${MY_DIR}.tar ${MY_DIR} *default.mk ;\
+	cd ${MY_DIR}
+# 	tar cf `basename ${PWD}`.tar *
+
+doc:
+	doxygen Doxyfile
+
+#########################################################################
+
+.cpp.o:
+	$(CXX) -c $(CXXFLAGS) -o $@ $<
+
+.c.o:
+	$(CC) -c $(CFLAGS) -o $@ $<
+
+.f.o:
+	$(F77) -c $(FFLAGS) -o $@ $<
+
+##################################################################################################
+#    some tools
+# Cache behaviour (CXXFLAGS += -g  tracks down to source lines)
+# https://software.intel.com/content/www/us/en/develop/documentation/vtune-help/top/analyze-performance/microarchitecture-analysis-group/memory-access-analysis.html
+
+mem: inspector
+prof: vtune
+cache: inspector
+
+gap_par_report:
+	${CXX}  -c -guide -parallel $(SOURCES) 2> gap.txt
+
+# GUI for performance report
+amplifier: ${PROGRAM}
+	echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope
+	echo 0 | sudo tee /proc/sys/kernel/perf_event_paranoid
+	amplxe-gui &
+
+# GUI for Memory and Thread analyzer (race condition)
+inspector: ${PROGRAM}
+	echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope
+#	inspxe-gui &
+	vtune-gui ./${PROGRAM} &
+
+# Command-line survey with Intel Advisor; the CSV report goes to ./out/survey.csv.
+# ${PROGRAM} is a prerequisite so the binary exists before it is analysed.
+# Every recipe line is run in its own shell, so the environment script has to be
+# read in the same shell invocation as the advisor commands; "." is used instead
+# of "source" because make's default shell is /bin/sh.
+#	/opt/intel/oneapi/advisor/latest/bin64/advixe-gui &
+#	advisor --collect=roofline ./${PROGRAM}
+advisor: ${PROGRAM}
+	. /opt/intel/oneapi/advisor/2021.2.0/advixe-vars.sh && \
+	advisor --collect=survey ./${PROGRAM} && \
+	advisor --report=survey --project-dir=./ src:r=./ --format=csv --report-output=./out/survey.csv
+
+# Command-line hotspot analysis with VTune; the text report is written to vtune.out.
+# ${PROGRAM} is a prerequisite so the binary exists before it is profiled
+# (same as the amplifier and inspector targets).
+# The export and both vtune calls share one shell: every recipe line is run in a
+# separate shell, so a stand-alone "export" line would not reach the commands after it.
+#	https://software.intel.com/en-us/articles/intel-advisor-2017-update-1-what-s-new
+#	vtune-gui ./${PROGRAM} &
+vtune: ${PROGRAM}
+	echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope
+	export ADVIXE_EXPERIMENTAL=roofline && \
+	vtune -collect hotspots ./${PROGRAM} && \
+	vtune -report hotspots -r r000hs > vtune.out
+
+icc-info:
+	icpc -# main.cpp
+
+# MKL on AMD
+# https://www.computerbase.de/2019-11/mkl-workaround-erhoeht-leistung-auf-amd-ryzen/
+#
+# https://sites.google.com/a/uci.edu/mingru-yang/programming/mkl-has-bad-performance-on-an-amd-cpu
+# export MKL_DEBUG_CPU_TYPE=5
+# export MKL_NUM_THREADS=1
+# export MKL_DYNAMIC=false
+#  on Intel compiler
+# http://publicclu2.blogspot.com/2013/05/intel-complier-suite-reference-card.html
--- a/ex3/stream/PGI_default.mk
+++ b/ex3/stream/PGI_default.mk
@ -0,0 +1,94 @@
+# Basic Definitions for using PGI-compiler suite sequentially
+# requires setting of COMPILER=PGI_
+# OPTIRUN = optirun
+
+
+CC	= pgcc
+CXX     = pgc++
+F77	= pgfortran
+LINKER  = ${CXX}
+
+#LINKFLAGS   += -llapack -lblas
+# on mephisto:
+#CXXFLAGS  += -I/share/apps/atlas/include
+#LINKFLAGS += -L/share/apps/atlas/lib
+#LINKFLAGS   += -lcblas -latlas
+
+#LINKFLAGS   += -lblas
+# Der <cblas.h> Header muss mit extern "C" versehen werden, damit g++ alles findet.
+
+WARNINGS = -Minform=warn
+# -Wall -pedantic -Wextra -Weffc++ -Woverloaded-virtual -W -Wfloat-equal -Wshadow  -Wredundant-decls
+#           -pedantic -Wunreachable-code -Wextra -Winline
+#  -Wunreachable-code
+
+#PGI_PROFILING = -Minfo=ccff,loop,vect,opt,intensity,mp,accel
+PGI_PROFILING = -Minfo=ccff,accel,ipa,loop,lre,mp,opt,par,unified,vect,intensity
+# -Minfo
+# -Mprof=time
+# -Mprof=lines
+#        take care with option      -Msafeptr
+CXXFLAGS += -O3 -std=c++17 ${WARNINGS}
+#CXXFLAGS += -O3 -std=c++11  -DNDEBUG ${PGI_PROFILING} ${WARNINGS}
+#  -fastsse  -fargument-noalias ${WARNINGS}  -msse3 -vec-report=3
+
+default:	${PROGRAM}
+
+${PROGRAM}:	${OBJECTS}
+	$(LINKER)  $^  ${LINKFLAGS} -o $@
+
+clean:
+	@rm -f ${PROGRAM} ${OBJECTS}
+
+clean_all:: clean
+	@rm -f *_ *~ *.bak *.log *.out *.tar
+
+run: clean ${PROGRAM}
+	./${PROGRAM}
+
+# tar the current directory
+MY_DIR = `basename ${PWD}`
+tar: clean_all
+	@echo "Tar the directory: " ${MY_DIR}
+	@cd .. ;\
+	tar cf ${MY_DIR}.tar ${MY_DIR} *default.mk ;\
+	cd ${MY_DIR}
+# 	tar cf `basename ${PWD}`.tar *
+
+doc:
+	doxygen Doxyfile
+
+#########################################################################
+
+.cpp.o:
+	$(CXX) -c $(CXXFLAGS) -o $@ $<
+
+.c.o:
+	$(CC) -c $(CFLAGS) -o $@ $<
+
+.f.o:
+	$(F77) -c $(FFLAGS) -o $@ $<
+
+##################################################################################################
+# #    some tools
+# #  Simple run time profiling of your code
+# #  CXXFLAGS += -g -pg
+# #  LINKFLAGS += -pg
+
+
+# Profiling options PGI, see: pgcollect -help
+# CPU_PROF = -allcache
+CPU_PROF = -time
+# GPU_PROF = -cuda=gmem,branch,cc13 -cudainit
+#GPU_PROF = -cuda=branch:cc20
+#
+PROF_FILE = pgprof.out
+
+cache: prof
+
+prof: ${PROGRAM}
+	${OPTIRUN} ${BINDIR}pgcollect $(CPU_PROF) ./$^
+	${OPTIRUN} ${BINDIR}pgprof -exe ./$^  $(PROF_FILE) &
+
+info:
+	pgaccelinfo -v
--- a/ex3/stream/stream/HISTORY.txt
+++ b/ex3/stream/stream/HISTORY.txt
@ -0,0 +1,152 @@
+-------------------------------------------------------------------------
+
+Revisions as of Thu, Jan 17, 2013  3:50:01 PM
+
+Version 5.10 of stream.c has been released.
+This version includes improved validation code and will automatically
+use 64-bit array indices on 64-bit systems to allow very large arrays.
+
+-------------------------------------------------------------------------
+
+Revisions as of Thu Feb 19 08:16:57 CST 2009
+
+Note that the codes in the "Versions" subdirectory should be
+considered obsolete -- the versions of stream.c and stream.f
+in this main directory include the OpenMP directives and structure
+for creating "TUNED" versions.  
+
+Only the MPI version in the "Versions" subdirectory should be
+of any interest, and I have not recently checked that version for
+errors or compliance with the current versions of stream.c and
+stream.f.
+
+I added a simple Makefile to this directory.  It works under Cygwin
+on my Windows XP box (using gcc and g77).
+
+A user suggested a sneaky trick for "mysecond.c" -- instead of using
+the #ifdef UNDERSCORE to generate the function name that the Fortran
+compiler expects, the new version simply defines both "mysecond()"
+and "mysecond_()", so it should automagically link with most Fortran
+compilers.
+
+-------------------------------------------------------------------------
+
+Revisions as of Wed Nov 17 09:15:37 CST 2004
+
+The most recent "official" versions have been renamed "stream.f" and
+"stream.c" -- all other versions have been moved to the "Versions"
+subdirectory.
+
+The "official" timer (was "second_wall.c") has been renamed "mysecond.c".
+This is embedded in the C version ("stream.c"), but still needs to be
+externally linked to the FORTRAN version ("stream.f").
+
+-------------------------------------------------------------------------
+
+Revisions as of Tue May 27 11:51:23 CDT 2003
+
+Copyright and License info added to stream_d.f, stream_mpi.f, and
+stream_tuned.f
+
+
+-------------------------------------------------------------------------
+
+Revisions as of Tue Apr  8 10:26:48 CDT 2003
+
+I changed the name of the timer interface from "second" to "mysecond"
+and removed the dummy argument in all versions of the source code (but
+not the "Contrib" versions).
+
+
+-------------------------------------------------------------------------
+
+Revisions as of Mon Feb 25 06:48:14 CST 2002
+
+Added an OpenMP version of stream_d.c, called stream_d_omp.c.  This is
+still not up to date with the Fortran version, which includes error
+checking and advanced data flow to prevent overoptimization, but it is
+a good start....
+
+
+-------------------------------------------------------------------------
+
+Revisions as of Tue Jun  4 16:31:31 EDT 1996
+
+I have fixed an "off-by-one" error in the RMS time calculation in
+stream_d.f.  This was already corrected in stream_d.c.  No results are
+invalidated, since I use minimum time instead of RMS time anyway....
+
+-------------------------------------------------------------------------
+
+Revisions as of Fri Dec  8 14:49:56 EST 1995
+
+I have renamed the timer routines to:
+	second_cpu.c
+	second_wall.c
+	second_cpu.f
+
+All have a function interface named 'second' which returns a double
+precision floating point number.  It should be possible to link
+second_wall.c with stream_d.f without too much trouble, though the
+details will depend on your environment.
+
+If anyone builds versions of these timers for machines running the
+Macintosh O/S or DOS/Windows, I would appreciate getting a copy.
+
+To clarify:
+  * For single-user machines, the wallclock timer is preferred.
+  * For parallel machines, the wallclock timer is required.
+  * For time-shared systems, the cpu timer is more reliable,
+        though less accurate.
+    
+
+-------------------------------------------------------------------------
+
+Revisions as of Wed Oct 25 09:40:32 EDT 1995
+
+(1) NOTICE to C users:
+
+    stream_d.c has been updated to version 4.0 (beta), and
+    should be functionally identical to stream_d.f
+
+    Two timers are provided --- second_cpu.c and second_wall.c
+    second_cpu.c measures cpu time, while second_wall.c measures
+    elapsed (real) time.   
+
+    For single-user machines, the wallclock timer is preferred.
+    For parallel machines, the wallclock timer is required.
+    For time-shared systems, the cpu timer is more reliable,
+    though less accurate.
+    
+(2) cstream.c has been removed -- use stream_d.c
+
+(3) stream_wall.f has been removed --- to do parallel aggregate
+    bandwidth runs, comment out the definition of FUNCTION SECOND
+    in stream_d.f and compile/link with second_wall.c
+
+(4) stream_offset has been deprecated.  It is still here
+    and usable, but stream_d.f is the "standard" version.
+    There are easy hooks in stream_d.f to change the
+    array offsets if you want to.
+
+(5) The rules of the game are clarified as follows:
+
+    The reference case uses array sizes of 2,000,000 elements
+    and no additional offsets.  I would like to see results
+    for this case.
+
+    But, you are free to use any array size and any offset
+    you want, provided that the arrays are each bigger than
+    the last-level of cache.  The output will show me what
+    parameters you chose.
+
+    I expect that I will report just the best number, but
+    if there is a serious discrepancy between the reference
+    case and the "best" case, I reserve the right to report 
+    both.
+
+    Of course, I also reserve the right to reject any results
+    that I do not trust....
+--
+John D. McCalpin, Ph.D.        
+john@mccalpin.com
--- a/ex3/stream/stream/LICENSE.txt
+++ b/ex3/stream/stream/LICENSE.txt
@ -0,0 +1,34 @@
+*=======================================================================
+*-----------------------------------------------------------------------
+* Copyright 1991-2003: John D. McCalpin
+*-----------------------------------------------------------------------
+* License:
+*  1. You are free to use this program and/or to redistribute
+*     this program.
+*  2. You are free to modify this program for your own use,
+*     including commercial use, subject to the publication
+*     restrictions in item 3.
+*  3. You are free to publish results obtained from running this
+*     program, or from works that you derive from this program,
+*     with the following limitations:
+*     3a. In order to be referred to as "STREAM benchmark results",
+*         published results must be in conformance to the STREAM
+*         Run Rules, (briefly reviewed below) published at
+*         http://www.cs.virginia.edu/stream/ref.html
+*         and incorporated herein by reference.
+*         As the copyright holder, John McCalpin retains the
+*         right to determine conformity with the Run Rules.
+*     3b. Results based on modified source code or on runs not in
+*         accordance with the STREAM Run Rules must be clearly
+*         labelled whenever they are published.  Examples of
+*         proper labelling include:
+*         "tuned STREAM benchmark results" 
+*         "based on a variant of the STREAM benchmark code"
+*         Other comparable, clear and reasonable labelling is
+*         acceptable.
+*     3c. Submission of results to the STREAM benchmark web site
+*         is encouraged, but not required.
+*  4. Use of this program or creation of derived works based on this
+*     program constitutes acceptance of these licensing restrictions.
+*  5. Absolutely no warranty is expressed or implied.
+*-----------------------------------------------------------------------
--- a/ex3/stream/stream/Makefile
+++ b/ex3/stream/stream/Makefile
@ -0,0 +1,44 @@
+CC = gcc
+CFLAGS = -O3 
+DIMENSIONS = -DSTREAM_ARRAY_SIZE=80000000 -DNTIMES=20 
+
+FF = gfortran
+FFLAGS = -O3
+
+all: stream_f.exe stream_c.exe flops.exe
+
+stream_f.exe: stream.f mysecond.o
+	$(CC) $(CFLAGS) -c mysecond.c
+	$(FF) $(FFLAGS) $(DIMENSIONS)  -c stream.f
+	$(FF) $(FFLAGS) stream.o mysecond.o -o stream_f.exe
+
+stream_c.exe: stream.c
+	$(CC) $(CFLAGS) $(DIMENSIONS) stream.c -o stream_c.exe
+
+clean:
+	rm -f *.exe *.o
+
+# an example of a more complex build line for the Intel icc compiler
+stream.icc: stream.c
+	icc -O3 -xCORE-AVX2 -ffreestanding -qopenmp -DSTREAM_ARRAY_SIZE=80000000 -DNTIMES=20 stream.c -o stream.omp.AVX2.80M.20x.icc
+
+# GH
+# Build the flops benchmark from flops.c (compiled with -DUNIX).
+# flops.c is listed as a prerequisite so that the binary is rebuilt when the
+# source changes; with an empty prerequisite list an already existing
+# flops.exe is always considered up to date.
+flops.exe: flops.c
+	$(CC) $(CFLAGS) -DUNIX flops.c -o flops.exe
+
+run: clean all
+	./stream_c.exe
+	./flops.exe
+
+MY_DIR = `basename ${PWD}`
+tar: clean
+	@cd .. ;\
+	tar cf ${MY_DIR}.tar ${MY_DIR} *default.mk ;\
+	cd ${MY_DIR}
+
+
+zip: clean
+	@cd .. ;\
+	zip -r ${MY_DIR}.zip ${MY_DIR} *default.mk;\
+	cd ${MY_DIR}
+# HG
--- a/ex3/stream/stream/READ.ME
+++ b/ex3/stream/stream/READ.ME
@ -0,0 +1,110 @@
+===============================================
+
+STREAM is the de facto industry standard benchmark
+for measuring sustained memory bandwidth.
+
+Documentation for STREAM is on the web at:
+   http://www.cs.virginia.edu/stream/ref.html
+
+===============================================
+NEWS
+===============================================
+UPDATE: October 28 2014:
+
+"stream_mpi.c" released in the Versions directory.
+
+Based on Version 5.10 of stream.c, stream_mpi.c
+brings the following new features:
+* MPI implementation that *distributes* the arrays
+  across all MPI ranks. (The older Fortran version
+  of STREAM in MPI *replicates* the arrays across
+  all MPI ranks.)
+* Data is allocated using "posix_memalign" 
+  rather than using static arrays.  Different
+  compiler flags may be needed for both portability
+  and optimization.
+  See the READ.ME file in the Versions directory
+  for more details.
+* Error checking and timing done by all ranks and
+  gathered by rank 0 for processing and output.
+* Timing code uses barriers to ensure correct
+  operation even when multiple MPI ranks run on
+  shared memory systems.
+
+NOTE: MPI is not a preferred implementation for
+  STREAM, which is intended to measure memory
+  bandwidth in shared-memory systems.  In stream_mpi,
+  the MPI calls are only used to properly synchronize
+  the timers (using MPI_Barrier) and to gather
+  timing and error data, so the performance should 
+  scale linearly with the size of the cluster.
+  But it may be useful, and was an interesting 
+  exercise to develop and debug.
+
+===============================================
+UPDATE: January 17 2013:
+
+Version 5.10 of stream.c is finally available!
+
+There are no changes to what is being measured, but
+a number of long-awaited improvements have been made:
+
+* Updated validation code does not suffer from 
+  accumulated roundoff error for large arrays.
+* Defining the preprocessor variable "VERBOSE"
+  when compiling will (1) cause the code to print the
+  measured average relative absolute error (rather than
+  simply printing "Solution Validates"), and (2) print
+  the first 10 array entries with relative error exceeding
+  the error tolerance.
+* Array index variables have been upgraded from
+  "int" to "ssize_t" to allow arrays with more
+  than 2 billion elements on 64-bit systems.
+* Substantial improvements to the comments in 
+  the source on how to configure/compile/run the
+  benchmark.
+* The preprocessor variable controlling the array
+  size has been changed from "N" to "STREAM_ARRAY_SIZE".
+* A new preprocessor variable "STREAM_TYPE" can be
+  used to override the data type from the default
+  "double" to "float".
+  This mechanism could also be used to change to 
+  non-floating-point types, but several "printf"
+  statements would need to have their formats changed
+  to accommodate the modified data type.
+* Some small changes in output, including printing
+  array sizes in GiB as well as MiB.
+* Change to the default output format to print fewer
+  decimals for the bandwidth and more decimals for
+  the min/max/avg execution times.
+
+
+===============================================
+UPDATE: February 19 2009:
+
+The most recent "official" versions have been renamed
+"stream.f" and "stream.c" -- all other versions have
+been moved to the "Versions" subdirectory and should be
+considered obsolete.
+
+The "official" timer (was "second_wall.c") has been
+renamed "mysecond.c".   This is embedded in the C version
+("stream.c"), but still needs to be externally linked to
+the FORTRAN version ("stream.f").  The new version defines
+entry points both with and without trailing underscores,
+so it *should* link automagically with any Fortran compiler.
+
+===============================================
+
+STREAM is a project of "Dr. Bandwidth":
+	John D. McCalpin, Ph.D.
+	john@mccalpin.com
+
+===============================================
+
+The STREAM web and ftp sites are currently hosted at
+the Department of Computer Science at the University of
+Virginia under the generous sponsorship of Professor Bill
+Wulf and Professor Alan Batson.
+
+===============================================
--- a/ex3/stream/stream/flops.c
+++ b/ex3/stream/stream/flops.c
--- a/ex3/stream/stream/flops.exe
+++ b/ex3/stream/stream/flops.exe
--- a/ex3/stream/stream/mysecond.c
+++ b/ex3/stream/stream/mysecond.c
@ -0,0 +1,27 @@
+/* A gettimeofday routine to give access to the wall
+   clock timer on most UNIX-like systems.
+
+   This version defines two entry points -- with 
+   and without appended underscores, so it *should*
+   automagically link with FORTRAN */
+
+#include <stddef.h>
+#include <sys/time.h>
+
+/*
+ * Return the current wall-clock time in seconds as a double:
+ * tv_sec plus tv_usec * 1e-6, as filled in by gettimeofday().
+ *
+ * The timezone argument of gettimeofday() is passed as NULL. POSIX leaves
+ * the behaviour unspecified for a non-null pointer, and the value was never
+ * used by this routine.
+ *
+ * Returns 0.0 if gettimeofday() reports an error, rather than reading a
+ * struct timeval that was never filled in.
+ */
+double mysecond()
+{
+        struct timeval tp;
+
+        if (gettimeofday(&tp, NULL) != 0) {
+                return 0.0;
+        }
+        return ( (double) tp.tv_sec + (double) tp.tv_usec * 1.e-6 );
+}
+
+/* Second entry point with an appended underscore; it simply forwards to
+   mysecond() so that FORTRAN objects expecting that name can link. */
+double mysecond_()
+{
+        return mysecond();
+}
+
--- a/ex3/stream/stream/stream.c
+++ b/ex3/stream/stream/stream.c
@ -0,0 +1,585 @@
+/*-----------------------------------------------------------------------*/
+/* Program: STREAM                                                       */
+/* Revision: $Id: stream.c,v 5.10 2013/01/17 16:01:06 mccalpin Exp mccalpin $ */
+/* Original code developed by John D. McCalpin                           */
+/* Programmers: John D. McCalpin                                         */
+/*              Joe R. Zagar                                             */
+/*                                                                       */
+/* This program measures memory transfer rates in MB/s for simple        */
+/* computational kernels coded in C.                                     */
+/*-----------------------------------------------------------------------*/
+/* Copyright 1991-2013: John D. McCalpin                                 */
+/*-----------------------------------------------------------------------*/
+/* License:                                                              */
+/*  1. You are free to use this program and/or to redistribute           */
+/*     this program.                                                     */
+/*  2. You are free to modify this program for your own use,             */
+/*     including commercial use, subject to the publication              */
+/*     restrictions in item 3.                                           */
+/*  3. You are free to publish results obtained from running this        */
+/*     program, or from works that you derive from this program,         */
+/*     with the following limitations:                                   */
+/*     3a. In order to be referred to as "STREAM benchmark results",     */
+/*         published results must be in conformance to the STREAM        */
+/*         Run Rules, (briefly reviewed below) published at              */
+/*         http://www.cs.virginia.edu/stream/ref.html                    */
+/*         and incorporated herein by reference.                         */
+/*         As the copyright holder, John McCalpin retains the            */
+/*         right to determine conformity with the Run Rules.             */
+/*     3b. Results based on modified source code or on runs not in       */
+/*         accordance with the STREAM Run Rules must be clearly          */
+/*         labelled whenever they are published.  Examples of            */
+/*         proper labelling include:                                     */
+/*           "tuned STREAM benchmark results"                            */
+/*           "based on a variant of the STREAM benchmark code"           */
+/*         Other comparable, clear, and reasonable labelling is          */
+/*         acceptable.                                                   */
+/*     3c. Submission of results to the STREAM benchmark web site        */
+/*         is encouraged, but not required.                              */
+/*  4. Use of this program or creation of derived works based on this    */
+/*     program constitutes acceptance of these licensing restrictions.   */
+/*  5. Absolutely no warranty is expressed or implied.                   */
+/*-----------------------------------------------------------------------*/
+# include <stdio.h>
+# include <unistd.h>
+# include <math.h>
+# include <float.h>
+# include <limits.h>
+# include <sys/time.h>
+
+/*-----------------------------------------------------------------------
+ * INSTRUCTIONS:
+ *
+ *	1) STREAM requires different amounts of memory to run on different
+ *           systems, depending on both the system cache size(s) and the
+ *           granularity of the system timer.
+ *     You should adjust the value of 'STREAM_ARRAY_SIZE' (below)
+ *           to meet *both* of the following criteria:
+ *       (a) Each array must be at least 4 times the size of the
+ *           available cache memory. I don't worry about the difference
+ *           between 10^6 and 2^20, so in practice the minimum array size
+ *           is about 3.8 times the cache size.
+ *           Example 1: One Xeon E3 with 8 MB L3 cache
+ *               STREAM_ARRAY_SIZE should be >= 4 million, giving
+ *               an array size of 30.5 MB and a total memory requirement
+ *               of 91.5 MB.  
+ *           Example 2: Two Xeon E5's with 20 MB L3 cache each (using OpenMP)
+ *               STREAM_ARRAY_SIZE should be >= 20 million, giving
+ *               an array size of 153 MB and a total memory requirement
+ *               of 458 MB.  
+ *       (b) The size should be large enough so that the 'timing calibration'
+ *           output by the program is at least 20 clock-ticks.  
+ *           Example: most versions of Windows have a 10 millisecond timer
+ *               granularity.  20 "ticks" at 10 ms/tick is 200 milliseconds.
+ *               If the chip is capable of 10 GB/s, it moves 2 GB in 200 msec.
+ *               This means that each array must be at least 1 GB, or 128M elements.
+ *
+ *      Version 5.10 increases the default array size from 2 million
+ *          elements to 10 million elements in response to the increasing
+ *          size of L3 caches.  The new default size is large enough for caches
+ *          up to 20 MB. 
+ *      Version 5.10 changes the loop index variables from "register int"
+ *          to "ssize_t", which allows array indices >2^32 (4 billion)
+ *          on properly configured 64-bit systems.  Additional compiler options
+ *          (such as "-mcmodel=medium") may be required for large memory runs.
+ *
+ *      Array size can be set at compile time without modifying the source
+ *          code for the (many) compilers that support preprocessor definitions
+ *          on the compile line.  E.g.,
+ *                gcc -O -DSTREAM_ARRAY_SIZE=100000000 stream.c -o stream.100M
+ *          will override the default size of 10M with a new size of 100M elements
+ *          per array.
+ */
+/* Default element count per array; override with -DSTREAM_ARRAY_SIZE=... */
+#if !defined(STREAM_ARRAY_SIZE)
+#   define STREAM_ARRAY_SIZE	10000000
+#endif
+
+/*  2) STREAM runs each kernel "NTIMES" times and reports the *best* result
+ *         for any iteration after the first, therefore the minimum value
+ *         for NTIMES is 2.
+ *      There are no rules on maximum allowable values for NTIMES, but
+ *         values larger than the default are unlikely to noticeably
+ *         increase the reported performance.
+ *      NTIMES can also be set on the compile line without changing the source
+ *         code using, for example, "-DNTIMES=7".
+ */
+/* Use the default of 10 repetitions when NTIMES is unset or was given a
+ * value below the minimum of 2.  The too-small value is #undef'ed first so
+ * the default below is a fresh definition, not an incompatible redefinition. */
+#if defined(NTIMES) && NTIMES <= 1
+#   undef NTIMES
+#endif
+#ifndef NTIMES
+#   define NTIMES	10
+#endif
+
+/*  Users are allowed to modify the "OFFSET" variable, which *may* change the
+ *         relative alignment of the arrays (though compilers may change the 
+ *         effective offset by making the arrays non-contiguous on some systems). 
+ *      Use of non-zero values for OFFSET can be especially helpful if the
+ *         STREAM_ARRAY_SIZE is set to a value close to a large power of 2.
+ *      OFFSET can also be set on the compile line without changing the source
+ *         code using, for example, "-DOFFSET=56".
+ */
+/* Extra elements appended to each array; override with -DOFFSET=... */
+#if !defined(OFFSET)
+#   define OFFSET	0
+#endif
+
+/*
+ *	3) Compile the code with optimization.  Many compilers generate
+ *       unreasonably bad code before the optimizer tightens things up.  
+ *     If the results are unreasonably good, on the other hand, the
+ *       optimizer might be too smart for me!
+ *
+ *     For a simple single-core version, try compiling with:
+ *            cc -O stream.c -o stream
+ *     This is known to work on many, many systems....
+ *
+ *     To use multiple cores, you need to tell the compiler to obey the OpenMP
+ *       directives in the code.  This varies by compiler, but a common example is
+ *            gcc -O -fopenmp stream.c -o stream_omp
+ *       The environment variable OMP_NUM_THREADS allows runtime control of the 
+ *         number of threads/cores used when the resulting "stream_omp" program
+ *         is executed.
+ *
+ *     To run with single-precision variables and arithmetic, simply add
+ *         -DSTREAM_TYPE=float
+ *     to the compile line.
+ *     Note that this changes the minimum array sizes required --- see (1) above.
+ *
+ *     The preprocessor directive "TUNED" does not do much -- it simply causes the 
+ *       code to call separate functions to execute each kernel.  Trivial versions
+ *       of these functions are provided, but they are *not* tuned -- they just 
+ *       provide predefined interfaces to be replaced with tuned code.
+ *
+ *
+ *	4) Optional: Mail the results to mccalpin@cs.virginia.edu
+ *	   Be sure to include info that will help me understand:
+ *		a) the computer hardware configuration (e.g., processor model, memory type)
+ *		b) the compiler name/version and compilation flags
+ *      c) any run-time information (such as OMP_NUM_THREADS)
+ *		d) all of the output from the test case.
+ *
+ * Thanks!
+ *
+ *-----------------------------------------------------------------------*/
+
+/* Separator line printed between sections of the report. */
+#define HLINE "-------------------------------------------------------------\n"
+
+/* Two-argument minimum / maximum, unless the platform already supplies them.
+ * Each argument may be evaluated twice. */
+#if !defined(MIN)
+#define MIN(x,y) ((x)<(y)?(x):(y))
+#endif
+#if !defined(MAX)
+#define MAX(x,y) ((x)>(y)?(x):(y))
+#endif
+
+/* Element type of the arrays; override with -DSTREAM_TYPE=float. */
+#if !defined(STREAM_TYPE)
+#define STREAM_TYPE double
+#endif
+
+/* The three arrays the kernels read and write. */
+static STREAM_TYPE	a[STREAM_ARRAY_SIZE+OFFSET];
+static STREAM_TYPE	b[STREAM_ARRAY_SIZE+OFFSET];
+static STREAM_TYPE	c[STREAM_ARRAY_SIZE+OFFSET];
+
+/* Per-kernel timing statistics; index 0..3 = Copy, Scale, Add, Triad.
+ * mintime starts at a huge value so the first sample always replaces it. */
+static double	avgtime[4] = {0};
+static double	maxtime[4] = {0};
+static double	mintime[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};
+
+/* Row labels of the result table, in the same kernel order. */
+static char	*label[4] = {
+    "Copy:      ",
+    "Scale:     ",
+    "Add:       ",
+    "Triad:     "
+    };
+
+/* Bytes counted per pass of each kernel: two arrays' worth for Copy and
+ * Scale, three arrays' worth for Add and Triad. */
+static double	bytes[4] = {
+    2 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE,
+    2 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE,
+    3 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE,
+    3 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE
+    };
+
+/* Routines defined further down in this file. */
+double mysecond();
+void checkSTREAMresults();
+
+/* Kernel entry points used instead of the inline loops in TUNED builds. */
+#if defined(TUNED)
+void tuned_STREAM_Copy();
+void tuned_STREAM_Scale(STREAM_TYPE scalar);
+void tuned_STREAM_Add();
+void tuned_STREAM_Triad(STREAM_TYPE scalar);
+#endif
+
+/* Declared by hand for OpenMP builds. */
+#if defined(_OPENMP)
+int omp_get_num_threads();
+#endif
+/*
+ * Benchmark driver.  Prints the configuration, initializes the arrays,
+ * estimates the timer granularity, runs the four kernels (Copy, Scale,
+ * Add, Triad) NTIMES times each, prints the best rate and the avg/min/max
+ * time per kernel, and finally validates the array contents.
+ * Always returns 0.
+ */
+int
+main()
+    {
+    int			quantum, checktick();
+    int			BytesPerWord;
+    int			k;
+    ssize_t		j;
+    STREAM_TYPE		scalar;
+    double		t, times[4][NTIMES];	/* times[kernel][iteration] */
+
+    /* --- SETUP --- determine precision and check timing --- */
+
+    printf(HLINE);
+    printf("STREAM version $Revision: 5.10 $\n");
+    printf(HLINE);
+    BytesPerWord = sizeof(STREAM_TYPE);
+    printf("This system uses %d bytes per array element.\n",
+	BytesPerWord);
+
+    printf(HLINE);
+    /* Warn when the macro N is defined: it is not used by this code, which
+     * takes its array size from STREAM_ARRAY_SIZE. */
+#ifdef N
+    printf("*****  WARNING: ******\n");
+    printf("      It appears that you set the preprocessor variable N when compiling this code.\n");
+    printf("      This version of the code uses the preprocesor variable STREAM_ARRAY_SIZE to control the array size\n");
+    printf("      Reverting to default value of STREAM_ARRAY_SIZE=%llu\n",(unsigned long long) STREAM_ARRAY_SIZE);
+    printf("*****  WARNING: ******\n");
+#endif
+
+    /* Report array sizes; the totals cover the three arrays a, b and c. */
+    printf("Array size = %llu (elements), Offset = %d (elements)\n" , (unsigned long long) STREAM_ARRAY_SIZE, OFFSET);
+    printf("Memory per array = %.1f MiB (= %.1f GiB).\n", 
+	BytesPerWord * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.0),
+	BytesPerWord * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.0/1024.0));
+    printf("Total memory required = %.1f MiB (= %.1f GiB).\n",
+	(3.0 * BytesPerWord) * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.),
+	(3.0 * BytesPerWord) * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024./1024.));
+    printf("Each kernel will be executed %d times.\n", NTIMES);
+    printf(" The *best* time for each kernel (excluding the first iteration)\n"); 
+    printf(" will be used to compute the reported bandwidth.\n");
+
+    /* OpenMP builds only: the master thread reports the team size. */
+#ifdef _OPENMP
+    printf(HLINE);
+#pragma omp parallel 
+    {
+#pragma omp master
+	{
+	    k = omp_get_num_threads();
+	    printf ("Number of Threads requested = %i\n",k);
+        }
+    }
+#endif
+
+    /* OpenMP builds only: count the threads by letting every thread of a
+     * parallel region atomically increment k once. */
+#ifdef _OPENMP
+	k = 0;
+#pragma omp parallel
+#pragma omp atomic 
+		k++;
+    printf ("Number of Threads counted = %i\n",k);
+#endif
+
+    /* Initialize the arrays: a = 1, b = 2, c = 0. */
+#pragma omp parallel for
+    for (j=0; j<STREAM_ARRAY_SIZE; j++) {
+	    a[j] = 1.0;
+	    b[j] = 2.0;
+	    c[j] = 0.0;
+	}
+
+    printf(HLINE);
+
+    /* checktick() returns the estimated timer granularity in microseconds;
+     * a result below 1 is reported as sub-microsecond and clamped to 1 so
+     * the division further down is safe. */
+    if  ( (quantum = checktick()) >= 1) 
+	printf("Your clock granularity/precision appears to be "
+	    "%d microseconds.\n", quantum);
+    else {
+	printf("Your clock granularity appears to be "
+	    "less than one microsecond.\n");
+	quantum = 1;
+    }
+
+    /* Time one pass that doubles a[] to estimate how long a kernel takes.
+     * t ends up in microseconds.  checkSTREAMresults() accounts for this
+     * modification of a[]. */
+    t = mysecond();
+#pragma omp parallel for
+    for (j = 0; j < STREAM_ARRAY_SIZE; j++)
+		a[j] = 2.0E0 * a[j];
+    t = 1.0E6 * (mysecond() - t);
+
+    printf("Each test below will take on the order"
+	" of %d microseconds.\n", (int) t  );
+    printf("   (= %d clock ticks)\n", (int) (t/quantum) );
+    printf("Increase the size of the arrays if this shows that\n");
+    printf("you are not getting at least 20 clock ticks per test.\n");
+
+    printf(HLINE);
+
+    printf("WARNING -- The above is only a rough guideline.\n");
+    printf("For best results, please be sure you know the\n");
+    printf("precision of your system timer.\n");
+    printf(HLINE);
+    
+    /*	--- MAIN LOOP --- repeat test cases NTIMES times --- */
+
+    /* For each kernel, times[i][k] first holds the start timestamp and is
+     * then overwritten with the elapsed time in seconds. */
+    scalar = 3.0;
+    for (k=0; k<NTIMES; k++)
+	{
+	/* Copy: c = a */
+	times[0][k] = mysecond();
+#ifdef TUNED
+        tuned_STREAM_Copy();
+#else
+#pragma omp parallel for
+	for (j=0; j<STREAM_ARRAY_SIZE; j++)
+	    c[j] = a[j];
+#endif
+	times[0][k] = mysecond() - times[0][k];
+	
+	/* Scale: b = scalar * c */
+	times[1][k] = mysecond();
+#ifdef TUNED
+        tuned_STREAM_Scale(scalar);
+#else
+#pragma omp parallel for
+	for (j=0; j<STREAM_ARRAY_SIZE; j++)
+	    b[j] = scalar*c[j];
+#endif
+	times[1][k] = mysecond() - times[1][k];
+	
+	/* Add: c = a + b */
+	times[2][k] = mysecond();
+#ifdef TUNED
+        tuned_STREAM_Add();
+#else
+#pragma omp parallel for
+	for (j=0; j<STREAM_ARRAY_SIZE; j++)
+	    c[j] = a[j]+b[j];
+#endif
+	times[2][k] = mysecond() - times[2][k];
+	
+	/* Triad: a = b + scalar * c */
+	times[3][k] = mysecond();
+#ifdef TUNED
+        tuned_STREAM_Triad(scalar);
+#else
+#pragma omp parallel for
+	for (j=0; j<STREAM_ARRAY_SIZE; j++)
+	    a[j] = b[j]+scalar*c[j];
+#endif
+	times[3][k] = mysecond() - times[3][k];
+	}
+
+    /*	--- SUMMARY --- */
+
+    /* Accumulate sum, minimum and maximum over iterations 1..NTIMES-1. */
+    for (k=1; k<NTIMES; k++) /* note -- skip first iteration */
+	{
+	for (j=0; j<4; j++)
+	    {
+	    avgtime[j] = avgtime[j] + times[j][k];
+	    mintime[j] = MIN(mintime[j], times[j][k]);
+	    maxtime[j] = MAX(maxtime[j], times[j][k]);
+	    }
+	}
+    
+    /* The rate is bytes per pass divided by the best time, scaled by 1e-6
+     * (i.e. MB/s with 1 MB = 10^6 bytes). */
+    printf("Function    Best Rate MB/s  Avg time     Min time     Max time\n");
+    for (j=0; j<4; j++) {
+		avgtime[j] = avgtime[j]/(double)(NTIMES-1);
+
+		printf("%s%12.1f  %11.6f  %11.6f  %11.6f\n", label[j],
+	       1.0E-06 * bytes[j]/mintime[j],
+	       avgtime[j],
+	       mintime[j],
+	       maxtime[j]);
+    }
+    printf(HLINE);
+
+    /* --- Check Results --- */
+    checkSTREAMresults();
+    printf(HLINE);
+
+    return 0;
+}
+
+/* Number of timestamps sampled by checktick(). */
+# define	M	20
+
+/*
+ * Estimate the granularity of mysecond() in whole microseconds.
+ * Returns the smallest non-negative gap between consecutive samples,
+ * truncated to an int; the result is capped at 1000000.
+ */
+int
+checktick()
+    {
+    double	samples[M];
+    double	start, now;
+    int		idx, gap, smallest;
+
+    /* Take M samples; each one is the first reading that lies at least one
+     * microsecond after the reading taken when the wait began. */
+    idx = 0;
+    while (idx < M) {
+	start = mysecond();
+	do {
+	    now = mysecond();
+	} while ((now - start) < 1.0E-6);
+	samples[idx] = now;
+	idx++;
+	}
+
+    /* Smallest gap between neighbouring samples, negative gaps counted as 0. */
+    smallest = 1000000;
+    for (idx = 1; idx < M; idx++) {
+	gap = (int)( 1.0E6 * (samples[idx]-samples[idx-1]));
+	if (gap < 0)
+	    gap = 0;
+	if (gap < smallest)
+	    smallest = gap;
+	}
+
+    return smallest;
+    }
+
+
+
+/* A gettimeofday routine to give access to the wall
+   clock timer on most UNIX-like systems.  */
+
+#include <sys/time.h>
+
+/* Return the current wall-clock time from gettimeofday() as a double:
+   whole seconds plus the microsecond field scaled by 1e-6. */
+double mysecond()
+{
+        /* Zero-initialised so the result is defined even if the call fails;
+           the status itself is ignored, as it always was. */
+        struct timeval tp = { 0, 0 };
+
+        /* POSIX leaves the behaviour unspecified when the timezone argument
+           is not a null pointer, so pass a null pointer constant. */
+        (void) gettimeofday(&tp, (void *) 0);
+        return ( (double) tp.tv_sec + (double) tp.tv_usec * 1.e-6 );
+}
+
+/* Type-generic absolute value used by the validation below; the argument
+ * may be evaluated twice. */
+#ifndef abs
+#define abs(a) ((a) >= 0 ? (a) : -(a))
+#endif
+/*
+ * Validate a[], b[] and c[] after the benchmark has run.
+ *
+ * The expected value of every element is recomputed on scalars by replaying
+ * the initialization, the timing-calibration doubling of a[], and NTIMES
+ * rounds of the four kernels.  For each array the average absolute error,
+ * relative to the expected value, is compared against an epsilon chosen
+ * from sizeof(STREAM_TYPE).  Failures are printed together with a count of
+ * the elements whose relative error exceeds epsilon; with VERBOSE defined,
+ * the first few offending elements and a summary are printed as well.
+ */
+void checkSTREAMresults ()
+{
+	STREAM_TYPE aj,bj,cj,scalar;
+	STREAM_TYPE aSumErr,bSumErr,cSumErr;
+	STREAM_TYPE aAvgErr,bAvgErr,cAvgErr;
+	double epsilon;
+	ssize_t	j;
+	int	k,err;
+	int	ierr = 0;
+
+    /* reproduce initialization */
+	aj = 1.0;
+	bj = 2.0;
+	cj = 0.0;
+    /* a[] is modified during timing check */
+	aj = 2.0E0 * aj;
+    /* now execute timing loop */
+	scalar = 3.0;
+	for (k=0; k<NTIMES; k++)
+        {
+            cj = aj;
+            bj = scalar*cj;
+            cj = aj+bj;
+            aj = bj+scalar*cj;
+        }
+
+    /* accumulate deltas between observed and expected results */
+	aSumErr = 0.0;
+	bSumErr = 0.0;
+	cSumErr = 0.0;
+	for (j=0; j<STREAM_ARRAY_SIZE; j++) {
+		aSumErr += abs(a[j] - aj);
+		bSumErr += abs(b[j] - bj);
+		cSumErr += abs(c[j] - cj);
+	}
+	aAvgErr = aSumErr / (STREAM_TYPE) STREAM_ARRAY_SIZE;
+	bAvgErr = bSumErr / (STREAM_TYPE) STREAM_ARRAY_SIZE;
+	cAvgErr = cSumErr / (STREAM_TYPE) STREAM_ARRAY_SIZE;
+
+    /* tolerance depends on the element width */
+	if (sizeof(STREAM_TYPE) == 4) {
+		epsilon = 1.e-6;
+	}
+	else if (sizeof(STREAM_TYPE) == 8) {
+		epsilon = 1.e-13;
+	}
+	else {
+		/* sizeof yields size_t, which needs %zu rather than %lu */
+		printf("WEIRD: sizeof(STREAM_TYPE) = %zu\n",sizeof(STREAM_TYPE));
+		epsilon = 1.e-6;
+	}
+
+	err = 0;
+	if (abs(aAvgErr/aj) > epsilon) {
+		err++;
+		printf ("Failed Validation on array a[], AvgRelAbsErr > epsilon (%e)\n",epsilon);
+		printf ("     Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",aj,aAvgErr,abs(aAvgErr)/aj);
+		ierr = 0;
+		for (j=0; j<STREAM_ARRAY_SIZE; j++) {
+			if (abs(a[j]/aj-1.0) > epsilon) {
+				ierr++;
+#ifdef VERBOSE
+				if (ierr < 10) {
+					/* ssize_t has no portable conversion; print via long long */
+					printf("         array a: index: %lld, expected: %e, observed: %e, relative error: %e\n",
+						(long long) j,aj,a[j],abs((aj-a[j])/aAvgErr));
+				}
+#endif
+			}
+		}
+		printf("     For array a[], %d errors were found.\n",ierr);
+	}
+	if (abs(bAvgErr/bj) > epsilon) {
+		err++;
+		printf ("Failed Validation on array b[], AvgRelAbsErr > epsilon (%e)\n",epsilon);
+		printf ("     Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",bj,bAvgErr,abs(bAvgErr)/bj);
+		printf ("     AvgRelAbsErr > Epsilon (%e)\n",epsilon);
+		ierr = 0;
+		for (j=0; j<STREAM_ARRAY_SIZE; j++) {
+			if (abs(b[j]/bj-1.0) > epsilon) {
+				ierr++;
+#ifdef VERBOSE
+				if (ierr < 10) {
+					printf("         array b: index: %lld, expected: %e, observed: %e, relative error: %e\n",
+						(long long) j,bj,b[j],abs((bj-b[j])/bAvgErr));
+				}
+#endif
+			}
+		}
+		printf("     For array b[], %d errors were found.\n",ierr);
+	}
+	if (abs(cAvgErr/cj) > epsilon) {
+		err++;
+		printf ("Failed Validation on array c[], AvgRelAbsErr > epsilon (%e)\n",epsilon);
+		printf ("     Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",cj,cAvgErr,abs(cAvgErr)/cj);
+		printf ("     AvgRelAbsErr > Epsilon (%e)\n",epsilon);
+		ierr = 0;
+		for (j=0; j<STREAM_ARRAY_SIZE; j++) {
+			if (abs(c[j]/cj-1.0) > epsilon) {
+				ierr++;
+#ifdef VERBOSE
+				if (ierr < 10) {
+					printf("         array c: index: %lld, expected: %e, observed: %e, relative error: %e\n",
+						(long long) j,cj,c[j],abs((cj-c[j])/cAvgErr));
+				}
+#endif
+			}
+		}
+		printf("     For array c[], %d errors were found.\n",ierr);
+	}
+	if (err == 0) {
+		printf ("Solution Validates: avg error less than %e on all three arrays\n",epsilon);
+	}
+#ifdef VERBOSE
+	printf ("Results Validation Verbose Results: \n");
+	printf ("    Expected a(1), b(1), c(1): %f %f %f \n",aj,bj,cj);
+	printf ("    Observed a(1), b(1), c(1): %f %f %f \n",a[1],b[1],c[1]);
+	printf ("    Rel Errors on a, b, c:     %e %e %e \n",abs(aAvgErr/aj),abs(bAvgErr/bj),abs(cAvgErr/cj));
+#endif
+}
+
+#ifdef TUNED
+/*
+ * Placeholder implementations of the four kernels, compiled only when TUNED
+ * is defined.  They perform the same element-wise operations as the inline
+ * loops in main() and exist as hooks to be replaced by tuned code.
+ */
+
+/* Copy: c[i] = a[i] */
+void tuned_STREAM_Copy()
+{
+	ssize_t idx;
+#pragma omp parallel for
+	for (idx = 0; idx < STREAM_ARRAY_SIZE; idx++) {
+		c[idx] = a[idx];
+	}
+}
+
+/* Scale: b[i] = scalar * c[i] */
+void tuned_STREAM_Scale(STREAM_TYPE scalar)
+{
+	ssize_t idx;
+#pragma omp parallel for
+	for (idx = 0; idx < STREAM_ARRAY_SIZE; idx++) {
+		b[idx] = scalar*c[idx];
+	}
+}
+
+/* Add: c[i] = a[i] + b[i] */
+void tuned_STREAM_Add()
+{
+	ssize_t idx;
+#pragma omp parallel for
+	for (idx = 0; idx < STREAM_ARRAY_SIZE; idx++) {
+		c[idx] = a[idx]+b[idx];
+	}
+}
+
+/* Triad: a[i] = b[i] + scalar * c[i] */
+void tuned_STREAM_Triad(STREAM_TYPE scalar)
+{
+	ssize_t idx;
+#pragma omp parallel for
+	for (idx = 0; idx < STREAM_ARRAY_SIZE; idx++) {
+		a[idx] = b[idx]+scalar*c[idx];
+	}
+}
+/* end of the placeholder "tuned" kernels */
+#endif
--- a/ex3/stream/stream/stream.f
+++ b/ex3/stream/stream/stream.f
@ -0,0 +1,462 @@
+*=======================================================================
+* Program: STREAM
+* Programmer: John D. McCalpin
+* RCS Revision: $Id: stream.f,v 5.6 2005/10/04 00:20:48 mccalpin Exp mccalpin $
+*-----------------------------------------------------------------------
+* Copyright 1991-2003: John D. McCalpin
+*-----------------------------------------------------------------------
+* License:
+*  1. You are free to use this program and/or to redistribute
+*     this program.
+*  2. You are free to modify this program for your own use,
+*     including commercial use, subject to the publication
+*     restrictions in item 3.
+*  3. You are free to publish results obtained from running this
+*     program, or from works that you derive from this program,
+*     with the following limitations:
+*     3a. In order to be referred to as "STREAM benchmark results",
+*         published results must be in conformance to the STREAM
+*         Run Rules, (briefly reviewed below) published at
+*         http://www.cs.virginia.edu/stream/ref.html
+*         and incorporated herein by reference.
+*         As the copyright holder, John McCalpin retains the
+*         right to determine conformity with the Run Rules.
+*     3b. Results based on modified source code or on runs not in
+*         accordance with the STREAM Run Rules must be clearly
+*         labelled whenever they are published.  Examples of
+*         proper labelling include:
+*         "tuned STREAM benchmark results" 
+*         "based on a variant of the STREAM benchmark code"
+*         Other comparable, clear and reasonable labelling is
+*         acceptable.
+*     3c. Submission of results to the STREAM benchmark web site
+*         is encouraged, but not required.
+*  4. Use of this program or creation of derived works based on this
+*     program constitutes acceptance of these licensing restrictions.
+*  5. Absolutely no warranty is expressed or implied.
+*-----------------------------------------------------------------------
+* This program measures sustained memory transfer rates in MB/s for
+* simple computational kernels coded in FORTRAN.
+*
+* The intent is to demonstrate the extent to which ordinary user
+* code can exploit the main memory bandwidth of the system under
+* test.
+*=======================================================================
+* The STREAM web page is at:
+*          http://www.streambench.org
+*
+* Most of the content is currently hosted at:
+*          http://www.cs.virginia.edu/stream/
+*
+* BRIEF INSTRUCTIONS: 
+*       0) See http://www.cs.virginia.edu/stream/ref.html for details
+*       1) STREAM requires a timing function called mysecond().
+*          Several examples are provided in this directory.
+*          "CPU" timers are only allowed for uniprocessor runs.
+*          "Wall-clock" timers are required for all multiprocessor runs.
+*       2) The STREAM array sizes must be set to size the test.
+*          The value "N" must be chosen so that each of the three
+*          arrays is at least 4x larger than the sum of all the last-
+*          level caches used in the run, or 1 million elements, which-
+*          ever is larger.
+*          ------------------------------------------------------------
+*          Note that you are free to use any array length and offset
+*          that makes each array 4x larger than the last-level cache.
+*          The intent is to determine the *best* sustainable bandwidth
+*          available with this simple coding.  Of course, lower values
+*          are usually fairly easy to obtain on cached machines, but 
+*          by keeping the test to the *best* results, the answers are
+*          easier to interpret.
+*          You may put the arrays in common or not, at your discretion.
+*          There is a commented-out COMMON statement below.
+*          Fortran90 "allocatable" arrays are fine, too.
+*          ------------------------------------------------------------
+*       3) Compile the code with full optimization.  Many compilers
+*          generate unreasonably bad code before the optimizer tightens
+*          things up.  If the results are unreasonably good, on the
+*          other hand, the optimizer might be too smart for me.
+*          Please let me know if this happens.
+*       4) Mail the results to mccalpin@cs.virginia.edu
+*          Be sure to include:
+*               a) computer hardware model number and software revision
+*               b) the compiler flags
+*               c) all of the output from the test case.
+*          Please let me know if you do not want your name posted along
+*          with the submitted results.
+*       5) See the web page for more comments about the run rules and
+*          about interpretation of the results.
+*
+* Thanks,
+*   Dr. Bandwidth
+*=========================================================================
+*
+*     Main program: prints the configuration, initializes a, b and c,
+*     reports the timer granularity, runs the Copy, Scale, Add and
+*     Triad loops ntimes times each, prints rate and avg/min/max time
+*     per kernel, and calls checksums on the final arrays.
+      PROGRAM stream
+*     IMPLICIT NONE
+C     .. Parameters ..
+C     n = elements per array, offset = extra elements per array,
+C     ndim = declared array length, ntimes = repetitions per kernel
+      INTEGER n,offset,ndim,ntimes
+      PARAMETER (n=2000000,offset=0,ndim=n+offset,ntimes=10)
+C     ..
+C     .. Local Scalars ..
+      DOUBLE PRECISION scalar,t
+      INTEGER j,k,nbpw,quantum
+C     ..
+C     .. Local Arrays ..
+C     times(kernel,iteration) holds the elapsed time of each run
+      DOUBLE PRECISION maxtime(4),mintime(4),avgtime(4),
+     $                 times(4,ntimes)
+      INTEGER bytes(4)
+      CHARACTER label(4)*11
+C     ..
+C     .. External Functions ..
+      DOUBLE PRECISION mysecond
+      INTEGER checktick,realsize
+      EXTERNAL mysecond,checktick,realsize
+!$    INTEGER omp_get_num_threads
+!$    EXTERNAL omp_get_num_threads
+C     ..
+C     .. Intrinsic Functions ..
+C
+      INTRINSIC dble,max,min,nint,sqrt
+C     ..
+C     .. Arrays in Common ..
+      DOUBLE PRECISION a(ndim),b(ndim),c(ndim)
+C     ..
+C     .. Common blocks ..
+*     COMMON a,b,c
+C     ..
+C     .. Data statements ..
+C     bytes holds the number of arrays counted per kernel (2,2,3,3);
+C     it is multiplied by n and nbpw when the rate is printed.
+      DATA avgtime/4*0.0D0/,mintime/4*1.0D+36/,maxtime/4*0.0D0/
+      DATA label/'Copy:      ','Scale:     ','Add:       ',
+     $     'Triad:     '/
+      DATA bytes/2,2,3,3/
+C     ..
+
+*       --- SETUP --- determine precision and check timing ---
+
+C     nbpw = bytes per DOUBLE PRECISION word as guessed by realsize
+      nbpw = realsize()
+
+      PRINT *,'----------------------------------------------'
+      PRINT *,'STREAM Version $Revision: 5.6 $'
+      PRINT *,'----------------------------------------------'
+      WRITE (*,FMT=9010) 'Array size = ',n
+      WRITE (*,FMT=9010) 'Offset     = ',offset
+      WRITE (*,FMT=9020) 'The total memory requirement is ',
+     $  3*nbpw*n/ (1024*1024),' MB'
+      WRITE (*,FMT=9030) 'You are running each test ',ntimes,' times'
+      WRITE (*,FMT=9030) '--'
+      WRITE (*,FMT=9030) 'The *best* time for each test is used'
+C     NOTE(review): the summary loop below runs k = 2,ntimes, so only
+C     the first iteration is excluded; the next message also claims
+C     the last one is.
+      WRITE (*,FMT=9030) '*EXCLUDING* the first and last iterations'
+
+!$OMP PARALLEL
+!$OMP MASTER
+      PRINT *,'----------------------------------------------'
+!$    PRINT *,'Number of Threads = ',OMP_GET_NUM_THREADS()
+!$OMP END MASTER
+!$OMP END PARALLEL
+
+      PRINT *,'----------------------------------------------'
+!$OMP PARALLEL
+      PRINT *,'Printing one line per active thread....'
+!$OMP END PARALLEL
+
+C     Initialize the arrays: a = 2, b = 0.5, c = 0
+!$OMP PARALLEL DO
+      DO 10 j = 1,n
+          a(j) = 2.0d0
+          b(j) = 0.5D0
+          c(j) = 0.0D0
+   10 CONTINUE
+C     Timed pass that halves a (a becomes 1).  The elapsed time in t
+C     is not printed; t is reassigned in the main loop.
+      t = mysecond()
+!$OMP PARALLEL DO
+      DO 20 j = 1,n
+          a(j) = 0.5d0*a(j)
+   20 CONTINUE
+      t = mysecond() - t
+      PRINT *,'----------------------------------------------------'
+      quantum = checktick()
+      WRITE (*,FMT=9000)
+     $  'Your clock granularity/precision appears to be ',quantum,
+     $  ' microseconds'
+      PRINT *,'----------------------------------------------------'
+
+*       --- MAIN LOOP --- repeat test cases NTIMES times ---
+C     Around every timed loop, t is added to one array element before
+C     and after the loop -- presumably so the compiler cannot drop or
+C     reorder the timed work; TODO confirm.
+      scalar = 0.5d0*a(1)
+      DO 70 k = 1,ntimes
+
+C         Copy: c = a
+          t = mysecond()
+          a(1) = a(1) + t
+!$OMP PARALLEL DO
+          DO 30 j = 1,n
+              c(j) = a(j)
+   30     CONTINUE
+          t = mysecond() - t
+          c(n) = c(n) + t
+          times(1,k) = t
+
+C         Scale: b = scalar*c
+          t = mysecond()
+          c(1) = c(1) + t
+!$OMP PARALLEL DO
+          DO 40 j = 1,n
+              b(j) = scalar*c(j)
+   40     CONTINUE
+          t = mysecond() - t
+          b(n) = b(n) + t
+          times(2,k) = t
+
+C         Add: c = a + b
+          t = mysecond()
+          a(1) = a(1) + t
+!$OMP PARALLEL DO
+          DO 50 j = 1,n
+              c(j) = a(j) + b(j)
+   50     CONTINUE
+          t = mysecond() - t
+          c(n) = c(n) + t
+          times(3,k) = t
+
+C         Triad: a = b + scalar*c
+          t = mysecond()
+          b(1) = b(1) + t
+!$OMP PARALLEL DO
+          DO 60 j = 1,n
+              a(j) = b(j) + scalar*c(j)
+   60     CONTINUE
+          t = mysecond() - t
+          a(n) = a(n) + t
+          times(4,k) = t
+   70 CONTINUE
+
+*       --- SUMMARY ---
+C     Statistics over iterations 2..ntimes (the first one is skipped)
+      DO 90 k = 2,ntimes
+          DO 80 j = 1,4
+              avgtime(j) = avgtime(j) + times(j,k)
+              mintime(j) = min(mintime(j),times(j,k))
+              maxtime(j) = max(maxtime(j),times(j,k))
+   80     CONTINUE
+   90 CONTINUE
+      WRITE (*,FMT=9040)
+C     Rate = n*bytes(j)*nbpw bytes / best time, scaled by 1.0D6
+      DO 100 j = 1,4
+          avgtime(j) = avgtime(j)/dble(ntimes-1)
+          WRITE (*,FMT=9050) label(j),n*bytes(j)*nbpw/mintime(j)/1.0D6,
+     $      avgtime(j),mintime(j),maxtime(j)
+  100 CONTINUE
+      PRINT *,'----------------------------------------------------'
+      CALL checksums (a,b,c,n,ntimes)
+      PRINT *,'----------------------------------------------------'
+
+ 9000 FORMAT (1x,a,i6,a)
+ 9010 FORMAT (1x,a,i10)
+ 9020 FORMAT (1x,a,i4,a)
+ 9030 FORMAT (1x,a,i3,a,a)
+ 9040 FORMAT ('Function',5x,'Rate (MB/s)  Avg time   Min time  Max time'
+     $       )
+ 9050 FORMAT (a,4 (f10.4,2x))
+      END
+
+*-------------------------------------
+* INTEGER FUNCTION realsize()
+*
+* A semi-portable way to determine the precision of DOUBLE PRECISION
+* in Fortran.
+* Here used to guess how many bytes of storage a DOUBLE PRECISION
+* number occupies.
+*
+*     Returns the guessed storage size in bytes (4 or 8) of a DOUBLE
+*     PRECISION value.  It looks for the smallest j in 1..30 for which
+*     1.0d0 + 10**(-j) compares equal to 1.0d0 and maps that digit
+*     count to a size.  If no such j exists, the size is read from
+*     standard input instead.
+      INTEGER FUNCTION realsize()
+*     IMPLICIT NONE
+
+C     .. Local Scalars ..
+      DOUBLE PRECISION result,test
+      INTEGER j,ndigits
+C     ..
+C     .. Local Arrays ..
+      DOUBLE PRECISION ref(30)
+C     ..
+C     .. External Subroutines ..
+      EXTERNAL confuse
+C     ..
+C     .. Intrinsic Functions ..
+      INTRINSIC abs,acos,log10,sqrt
+C     ..
+
+C       Test #1 - compare single(1.0d0+delta) to 1.0d0
+
+C     ref(j) = 1 + 10**(-j) for j = 1..30
+   10 DO 20 j = 1,30
+          ref(j) = 1.0d0 + 10.0d0** (-j)
+   20 CONTINUE
+
+C     Stop at the first j whose reference value equals 1.0d0.
+C     The value confuse stores in result is not used here; the call
+C     is presumably an optimization barrier -- TODO confirm.
+      DO 30 j = 1,30
+          test = ref(j)
+          ndigits = j
+          CALL confuse(test,result)
+          IF (test.EQ.1.0D0) THEN
+              GO TO 40
+          END IF
+   30 CONTINUE
+      GO TO 50
+
+C     Success: 8 digits or fewer is taken as 4 bytes, otherwise 8
+   40 WRITE (*,FMT='(a)')
+     $  '----------------------------------------------'
+      WRITE (*,FMT='(1x,a,i2,a)') 'Double precision appears to have ',
+     $  ndigits,' digits of accuracy'
+      IF (ndigits.LE.8) THEN
+          realsize = 4
+      ELSE
+          realsize = 8
+      END IF
+      WRITE (*,FMT='(1x,a,i1,a)') 'Assuming ',realsize,
+     $  ' bytes per DOUBLE PRECISION word'
+      WRITE (*,FMT='(a)')
+     $  '----------------------------------------------'
+      RETURN
+
+C     Fallback: ask the user.
+C     NOTE(review): an invalid answer is re-prompted only once; the
+C     second answer is accepted without being checked.
+   50 PRINT *,'Hmmmm.  I am unable to determine the size.'
+      PRINT *,'Please enter the number of Bytes per DOUBLE PRECISION',
+     $  ' number : '
+      READ (*,FMT=*) realsize
+      IF (realsize.NE.4 .AND. realsize.NE.8) THEN
+          PRINT *,'Your answer ',realsize,' does not make sense.'
+          PRINT *,'Try again.'
+          PRINT *,'Please enter the number of Bytes per ',
+     $      'DOUBLE PRECISION number : '
+          READ (*,FMT=*) realsize
+      END IF
+      PRINT *,'You have manually entered a size of ',realsize,
+     $  ' bytes per DOUBLE PRECISION number'
+      WRITE (*,FMT='(a)')
+     $  '----------------------------------------------'
+      END
+
+      SUBROUTINE confuse(q,r)
+C     Store the cosine of q in r.  q is only read, never modified.
+C     NOTE(review): going by the name and the call in realsize, this
+C     looks like an opaque external call meant to stop the compiler
+C     from reasoning about q -- confirm.
+*     IMPLICIT NONE
+
+C     .. Scalar Arguments ..
+C     q : input value
+C     r : receives cos(q)
+      DOUBLE PRECISION q
+      DOUBLE PRECISION r
+C     ..
+C     .. Intrinsic Functions ..
+      INTRINSIC cos
+C     ..
+
+      r = cos(q)
+      END
+
+* A semi-portable way to determine the clock granularity
+* Adapted from a code by John Henning of Digital Equipment Corporation
+*
+* Collects n consecutive, distinct readings of mysecond() and returns
+* the smallest gap between neighbouring readings after scaling by
+* 1d6 and rounding to the nearest integer (negative gaps count as 0).
+* If that minimum is 0, a message is printed and 1 is returned.
+* NOTE(review): the 1d6 scaling and the message text imply that
+* mysecond() returns seconds -- confirm against its definition.
+*
+      INTEGER FUNCTION checktick()
+*     IMPLICIT NONE
+
+C     .. Parameters ..
+      INTEGER n
+      PARAMETER (n=20)
+C     ..
+C     .. Local Scalars ..
+      DOUBLE PRECISION t1,t2
+      INTEGER i,j,jmin
+C     ..
+C     .. Local Arrays ..
+      DOUBLE PRECISION timesfound(n)
+C     ..
+C     .. External Functions ..
+      DOUBLE PRECISION mysecond
+      EXTERNAL mysecond
+C     ..
+C     .. Intrinsic Functions ..
+      INTRINSIC max,min,nint
+C     ..
+      i = 0
+C     Seed t1 so the first comparison below reads a defined value
+C     instead of an uninitialized local.
+      t1 = mysecond()
+
+C     Spin until the clock reading changes, then record the new one.
+   10 t2 = mysecond()
+      IF (t2.EQ.t1) GO TO 10
+
+      t1 = t2
+      i = i + 1
+      timesfound(i) = t1
+      IF (i.LT.n) GO TO 10
+
+C     Smallest non-negative gap between successive readings.
+      jmin = 1000000
+      DO 20 i = 2,n
+          j = nint((timesfound(i)-timesfound(i-1))*1d6)
+          jmin = min(jmin,max(j,0))
+   20 CONTINUE
+
+      IF (jmin.GT.0) THEN
+          checktick = jmin
+      ELSE
+          PRINT *,'Your clock granularity appears to be less ',
+     $      'than one microsecond'
+          checktick = 1
+      END IF
+      RETURN
+
+      END
+
+
+
+
+      SUBROUTINE checksums(a,b,c,n,ntimes)
+C     Validate arrays a, b and c against the sums predicted by a
+C     scalar replay of the kernel sequence.
+C
+C     a, b, c : arrays to check; only elements 2..n-1 are summed
+C     n       : number of elements in each array
+C     ntimes  : number of kernel repetitions to replay
+C
+C     Prints 'Solution Validates!' when all three sums agree with
+C     their targets to a relative tolerance of 1.D-6.  Otherwise it
+C     prints the target and computed sum of the first array that
+C     fails (a, then b, then c).
+*     IMPLICIT NONE
+C     ..
+C     .. Arguments ..
+      DOUBLE PRECISION a(*),b(*),c(*)
+      INTEGER n,ntimes
+C     ..
+C     .. Local Scalars ..
+      DOUBLE PRECISION aa,bb,cc,scalar,suma,sumb,sumc,epsilon
+      INTEGER j,k
+C     ..
+C     .. Intrinsic Functions ..
+      INTRINSIC abs,dble
+C     ..
+
+C     Repeat the main loop, but with scalars only.
+C     This is done to check the sum & make sure all
+C     iterations have been executed correctly.
+
+      aa = 2.0D0
+      bb = 0.5D0
+      cc = 0.0D0
+      aa = 0.5D0*aa
+      scalar = 0.5d0*aa
+      DO k = 1,ntimes
+          cc = aa
+          bb = scalar*cc
+          cc = aa + bb
+          aa = bb + scalar*cc
+      END DO
+C     Scale the per-element values to the n-2 elements summed below.
+      aa = aa*DBLE(n-2)
+      bb = bb*DBLE(n-2)
+      cc = cc*DBLE(n-2)
+
+C     Now sum up the arrays, excluding the first and last
+C     elements, which are modified using the timing results
+C     to confuse aggressive optimizers.
+
+      suma = 0.0d0
+      sumb = 0.0d0
+      sumc = 0.0d0
+!$OMP PARALLEL DO REDUCTION(+:suma,sumb,sumc)
+      DO 110 j = 2,n-1
+          suma = suma + a(j)
+          sumb = sumb + b(j)
+          sumc = sumc + c(j)
+  110 CONTINUE
+
+      epsilon = 1.D-6
+
+C     Relative-error test written without a division and against the
+C     magnitude of the sum, so a negative or zero sum is judged
+C     correctly.  The .NOT.(... .LE. ...) form makes a NaN sum fail
+C     rather than slip through as valid.
+      IF (.NOT. (ABS(suma-aa).LE.epsilon*ABS(suma))) THEN
+          PRINT *,'Failed Validation on array a()'
+          PRINT *,'Target   Sum of a is = ',aa
+          PRINT *,'Computed Sum of a is = ',suma
+      ELSEIF (.NOT. (ABS(sumb-bb).LE.epsilon*ABS(sumb))) THEN
+          PRINT *,'Failed Validation on array b()'
+          PRINT *,'Target   Sum of b is = ',bb
+          PRINT *,'Computed Sum of b is = ',sumb
+      ELSEIF (.NOT. (ABS(sumc-cc).LE.epsilon*ABS(sumc))) THEN
+          PRINT *,'Failed Validation on array c()'
+          PRINT *,'Target   Sum of c is = ',cc
+          PRINT *,'Computed Sum of c is = ',sumc
+      ELSE
+          PRINT *,'Solution Validates!'
+      ENDIF
+
+      END
+
--- a/ex3/stream/stream/stream_c.exe
+++ b/ex3/stream/stream/stream_c.exe
--- a/ex3/stream/stream/stream_f.exe
+++ b/ex3/stream/stream/stream_f.exe