ex3

2025-11-11 13:36:28 +01:00 · 2025-11-11 13:36:28 +01:00 · 7a02dff345
commit 7a02dff345
parent f0cd37fddc
29 changed files with 3943 additions and 1 deletions
--- a/ex3/code/Makefile
+++ b/ex3/code/Makefile
@ -0,0 +1,30 @@
+PROGRAM	= main
+
+SOURCES = $(wildcard *.cpp)
+OBJECTS = ${SOURCES:.cpp=.o}
+
+CXX     = g++
+LINKER  = g++
+
+WARNINGS = -Wall -pedantic -Wextra -Weffc++ -Woverloaded-virtual -Wfloat-equal -Wshadow \
+           -Wredundant-decls -fmax-errors=1
+
+CXXFLAGS   = -g -flto -O3 -ffast-math -march=native ${WARNINGS}
+LINKFLAGS  = -g -flto -O3 -lopenblas -llapacke
+
+
+all: ${PROGRAM}
+
+# %.o: %.cpp
+# 	${CXX} ${CXXFLAGS} -c $< -o $@
+
+${PROGRAM}:	${OBJECTS}
+	$(LINKER) ${OBJECTS} ${LINKFLAGS} -o ${PROGRAM}
+
+clean:
+	rm -f ${OBJECTS} ${PROGRAM}
+
+
+run: ${PROGRAM}
+# run: clean ${PROGRAM}
+	./${PROGRAM}
--- a/ex3/code/main.cpp
+++ b/ex3/code/main.cpp
@ -0,0 +1,267 @@
+
+#include "task_3.h"
+#include "task_4+6.h"
+#include "task_5.h"
+#include "task_7.h"
+#include "timing.h"
+
+#include <iomanip>
+#include <iostream>
+#include <cblas.h>               // cBLAS Library
+#include <lapacke.h>
+
+void task_1() {
+    printf("\n\n-------------- Task 1 --------------\n\n");
+    printf("See comment in main.cpp");
+
+    // -------------------------------------------------------------
+    // STREAM version $Revision: 5.10 $
+    // -------------------------------------------------------------
+    // This system uses 8 bytes per array element.
+    // -------------------------------------------------------------
+    // Array size = 80000000 (elements), Offset = 0 (elements)
+    // Memory per array = 610.4 MiB (= 0.6 GiB).
+    // Total memory required = 1831.1 MiB (= 1.8 GiB).
+    // Each kernel will be executed 20 times.
+    //  The *best* time for each kernel (excluding the first iteration)
+    //  will be used to compute the reported bandwidth.
+    // -------------------------------------------------------------
+    // Your clock granularity/precision appears to be 1 microseconds.
+    // Each test below will take on the order of 116886 microseconds.
+    //    (= 116886 clock ticks)
+    // Increase the size of the arrays if this shows that
+    // you are not getting at least 20 clock ticks per test.
+    // -------------------------------------------------------------
+    // WARNING -- The above is only a rough guideline.
+    // For best results, please be sure you know the
+    // precision of your system timer.
+    // -------------------------------------------------------------
+    // Function    Best Rate MB/s  Avg time     Min time     Max time
+    // Copy:           29569.4     0.048585     0.043288     0.059164
+    // Scale:          17644.0     0.082248     0.072546     0.102548
+    // Add:            21030.1     0.100620     0.091298     0.124700
+    // Triad:          21230.7     0.100758     0.090435     0.120631
+    // -------------------------------------------------------------
+    // Solution Validates: avg error less than 1.000000e-13 on all three arrays
+    // -------------------------------------------------------------
+    // ./flops.exe
+
+    //    FLOPS C Program (Double Precision), V2.0 18 Dec 1992
+
+    //    Module     Error        RunTime      MFLOPS
+    //                             (usec)
+    //      1      4.0146e-13      0.0024   5827.9076
+    //      2     -1.4166e-13      0.0007  10037.8942
+    //      3      4.7184e-14      0.0039   4371.9185
+    //      4     -1.2557e-13      0.0034   4355.5711
+    //      5     -1.3800e-13      0.0066   4415.6439
+    //      6      3.2380e-13      0.0065   4441.6299
+    //      7     -8.4583e-11      0.0053   2277.1707
+    //      8      3.4867e-13      0.0069   4367.6094
+
+    //    Iterations      =  512000000
+    //    NullTime (usec) =     0.0000
+    //    MFLOPS(1)       =  7050.6178
+    //    MFLOPS(2)       =  3461.6233
+    //    MFLOPS(3)       =  4175.0442
+    //    MFLOPS(4)       =  4389.7311
+}
+
+void task_2() {
+    printf("\n\n-------------- Task 2 --------------\n\n");
+    printf("See comment in main.cpp");
+
+    // Memory needed (double 64-bit, 8 bytes):
+    // (A) (2N + 1) * 8 bytes
+    // (B) (M*N + M + N) * 8 bytes
+    // (C) (M*L + L*N + M*N) * 8 bytes
+    // (D) (N + N + p) * 8 bytes
+
+    // Floating point operations:
+    // (A) 2N
+    // (B) M * 2N
+    // (C) M * 2L * N
+    // (D) 2 * N * p (Horner Schema)
+
+    // Read/Write operations:
+    // (A) Read: 2N         Write: 1
+    // (B) Read: M*2N       Write: M*N
+    // (C) Read: M*2L*N     Write: M*L*N
+    // (D) Read: 2*N*p      Write: N*P
+}
+
+void task_3() {
+    printf("\n\n-------------- Task 3 --------------\n\n");
+    printf("Functions implemented in task_3.cpp");
+}
+
+void task_4(bool cblas = false) {
+    if (cblas == false) {printf("\n\n-------------- Task 4 --------------\n\n");}
+    size_t M, N, L, p, NLOOPS;
+
+    { //          Scalar product
+    printf("----- Benchmark (A) -----\n");
+    // Initialization
+        N = 50'000'000;
+        NLOOPS = 50;
+        auto [x,y] = init_A(N);
+    // Benchmark
+        tic();
+        benchmark_A(x, y, NLOOPS, cblas);
+        double sec = toc() / NLOOPS;
+    // Timings and Performance
+        size_t memory = 2 * N;
+        size_t flops  = 2 * N;
+        print_performance(sec, memory, flops, sizeof(x[0]));
+    printf("-------------------------\n");
+    }
+
+    { //      Matrix-Vector product
+    printf("----- Benchmark (B) -----\n");
+    // Initialization
+        M = 8'000;
+        N = 12'000;
+        NLOOPS = 30;
+        auto [A,x] = init_B(M,N);
+    // Benchmark
+        tic();
+        benchmark_B(A, x, NLOOPS, cblas);
+        double sec = toc() / NLOOPS;
+    // Timings and Performance
+        size_t memory = M*N + M + N;
+        size_t flops  = 2 * M * N;
+        print_performance(sec, memory, flops, sizeof(A[0]));
+    printf("-------------------------\n");
+    }
+
+    { //     Matrix-Matrix product
+    printf("----- Benchmark (C) -----\n");
+    // Initialization
+        M = 1'000;
+        N = 2'000;
+        L = 500;
+        NLOOPS = 20;
+        auto [A,B] = init_C(M,N,L);
+    // Benchmark
+        tic();
+        benchmark_C(A, B, L, NLOOPS, cblas);
+        double sec = toc() / NLOOPS;
+    // Timings and Performance
+        size_t memory = M*L + L*N + M*N;
+        size_t flops  = M * 2*L * N;
+        print_performance(sec, memory, flops, sizeof(A[0]));
+    printf("-------------------------\n");
+    }
+
+    if (cblas == false)
+    { //      Polynomial evaluation
+    printf("----- Benchmark (D) -----\n");
+    // Initialization
+        N = 1'000'000;
+        p = 200;
+        NLOOPS = 20;
+        auto [x,a] = init_D(N,p);
+    // Benchmark
+        tic();
+        benchmark_D(x, a, NLOOPS);
+        double sec = toc() / NLOOPS;
+    // Timings  and Performance
+        size_t memory = 2.0 * N;
+        size_t flops  = 2.0 * N * p;
+        print_performance(sec, memory, flops, sizeof(x[0]));
+    printf("-------------------------\n");
+    }
+}
+
+void task_5() {
+    printf("\n\n-------------- Task 5 --------------\n\n");
+
+    printf("----- Benchmark norm -----\n");
+    // Initialization
+        size_t N =50'000'000;
+        size_t NLOOPS = 50;
+        vector<double> x = init_norm(N);
+    // Benchmark
+        tic();
+        benchmark_norm(x, NLOOPS);
+        double sec = toc() / NLOOPS;
+    // Timings and Performance
+        size_t memory = N;
+        size_t flops  = 2 * N;
+        print_performance(sec, memory, flops, sizeof(x[0]));
+    printf("-------------------------\n");
+    printf("What do you observe? Why?\n");
+    printf("-> Faster per loop than scalar product, only loads elements of 1 vector, instead of 2.");
+}
+
+void task_6() {
+    printf("\n\n-------------- Task 6 --------------\n\n");
+    printf("Benchmarks using cBLAS\n");
+    task_4(true);
+}
+
+void task_7() {
+    printf("\n\n-------------- Task 7 --------------\n\n");
+    { // Check Ax=b
+    size_t N=5, Nrhs=2;
+    auto [A,b] = init_M(N,Nrhs);
+    vector<double> A_og = A;
+
+    printf("A =");
+    print_matrix(A,N,N);
+    printf("b =");
+    print_matrix(b,N,Nrhs);
+
+    int lda=N, ldb=Nrhs;
+    vector<int> ipiv(N);
+    LAPACKE_dgetrf(LAPACK_ROW_MAJOR, N, N, A.data(), lda, ipiv.data());
+    LAPACKE_dgetrs(LAPACK_ROW_MAJOR, 'N', N, Nrhs, A.data(), lda, ipiv.data(), b.data(), ldb);
+
+    printf("L + U =");
+    print_matrix(A,N,N);
+    printf("x =");
+    print_matrix(b,N,Nrhs);
+
+    int ldc=Nrhs;
+    vector<double> C(N*Nrhs);
+    cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, N, Nrhs, N, 1.0, A_og.data(), lda, b.data(), ldb, 0.0, C.data(), ldc);
+    printf("Check solution:\nA * x = ");
+    print_matrix(C,N,Nrhs);
+    }
+    // #################################
+    // Benchmark
+
+    cout << fixed << setprecision(4);  // 4 digits after decimal
+    size_t NLOOPS = 1000;
+    cout << "N    = " << "  | 1      | 2      | 4      | 8      | 16     | 32 " << endl;
+    cout <<       "---------|--------|--------|--------|--------|--------|-------" << endl;
+    for (int exp = 1; exp < 10; ++exp) {
+        cout << "Nrhs = " << static_cast<size_t>(pow(2,exp));
+        for (size_t N : {1, 2, 4, 8, 16, 32}) {
+            tic();
+            for (size_t i = 0; i < NLOOPS; ++i) {
+                benchmark_lapacke(N, static_cast<size_t>(pow(2,exp)));
+            }
+            double sec = toc();
+            cout << " | " << sec;
+        }
+        cout << endl;
+    }
+    printf("\nFor fixed n, the solution time per rhs does not slow down consistently and scales very well.\nIts faster than expected.");
+
+
+
+}
+
+int main() {
+    task_1();
+    task_2();
+    task_3();
+    task_4();
+    task_5();
+    task_6();
+    task_7();
+    printf("\n\n");
+
+    return 0;
+}
--- a/ex3/code/task_3.cpp
+++ b/ex3/code/task_3.cpp
@ -0,0 +1,61 @@
+#include "task_3.h"
+#include <vector>
+#include <cassert>
+#include <iostream>
+#include <cmath>
+using namespace std;
+
+
+double scalar(vector<double> const &x, vector<double> const &y) {
+    assert(x.size() == y.size());
+    size_t const N = x.size();
+    double sum = 0.0;
+    for (size_t i = 0; i < N; ++i) {
+        sum += x[i] * y[i];
+    }
+    return sum;
+}
+
+
+vector<double> matrix_vec(vector<double> const &A, vector<double> const &x) {
+    size_t const N = x.size();
+    size_t const M = A.size() / N;
+    vector<double> b(M);
+
+    for (size_t i = 0; i < M; ++i) {
+        for (size_t j = 0; j < N; ++j) {
+            b[i] += A[i*N + j] * x[j];
+        }
+    }
+    return b;
+}
+
+
+vector<double> matrix_matrix(vector<double> const &A, vector<double> const &B, size_t const &M) {
+    size_t const L = A.size() / M;
+    size_t const N = B.size() / L;
+    vector<double> C(M*N,0);
+
+    for (size_t i = 0; i < M; ++i) {
+        for (size_t k = 0; k < L; ++k) {
+            for (size_t j = 0; j < N; ++j) {
+                C[i*N + j] += A[i*L + k] * B[k*N + j];
+            }
+        }
+    }
+    return C;
+}
+
+
+vector<double> poly(vector<double> const &x, vector<double> const &a) {
+    size_t N = x.size();
+    size_t p = a.size();
+    vector<double> y(N);
+    for (size_t i = 0; i < N; ++i) {
+        y[i] = a[p];
+        for (size_t k = 1; k < p; ++k) {
+            y[i] = y[i]*x[i] + a[p-k];
+        }
+    }
+    return y;
+}
--- a/ex3/code/task_3.h
+++ b/ex3/code/task_3.h
@ -0,0 +1,8 @@
+#pragma once
+#include <vector>
+using namespace std;
+
+double scalar(vector<double> const &x, vector<double> const &y);
+vector<double> matrix_vec(vector<double> const &A, vector<double> const &x);
+vector<double> matrix_matrix(vector<double> const &A, vector<double> const &B, size_t const &M);
+vector<double> poly(vector<double> const &x, vector<double> const &a);
--- a/ex3/code/task_4+6.cpp
+++ b/ex3/code/task_4+6.cpp
@ -0,0 +1,148 @@
+#include "task_3.h"
+#include "task_4+6.h"
+#include "timing.h"
+#include <vector>
+#include <iostream>
+#include <cblas.h>               // cBLAS Library
+using namespace std;
+
+void print_performance(double sec, size_t memory, size_t flops, unsigned int size) {
+    printf("Memory allocated  : %.3f GByte\n", 1.0 * memory / 1024 / 1024 / 1024 * size);
+    printf("Duration per loop : %.3f sec\n", sec);
+    printf("GFLOPS            : %.3f\n", 1.0 * flops / sec / 1024 / 1024 / 1024);
+    printf("GiByte/s          : %.3f\n", 1.0 * memory / sec / 1024 / 1024 / 1024 * size);
+}
+
+tuple<vector<double>, vector<double>> init_A(size_t N) {
+    vector<double> x(N), y(N);
+    for (size_t i = 0; i < N; ++i) {
+        x[i] = i%219 + 1.0;
+        y[i] = 1.0 / x[i];
+    }
+    return make_tuple(x, y);
+}
+
+void benchmark_A(vector<double> const &x, vector<double> const &y, size_t NLOOPS, bool cblas) {
+    size_t N = x.size();
+
+    double s(0.0), sum(0.0);
+    if (cblas == false) {
+        for (size_t i = 0; i < NLOOPS; ++i) {
+            s = scalar(x, y);
+            sum += s;
+        }
+    } else if (cblas == true) {
+        for (size_t i = 0; i < NLOOPS; ++i) {
+            s = cblas_ddot(N, x.data(), 1, y.data(), 1);
+            sum += s;
+        }
+    }
+
+    // Check correctness
+    if (static_cast<size_t>(sum) != N*NLOOPS) {printf("  !!   W R O N G  result   !!\n");}
+}
+
+tuple<vector<double>, vector<double>> init_B(size_t M, size_t N) {
+    vector<double> A(M*N), x(N);
+    for (size_t i = 0; i < M; ++i) {
+        for (size_t j = 0; j < N; ++j) {
+            A[i*N + j] = (i+j)%219 + 1.0;
+        }
+    }
+    for (size_t j = 0; j < N; ++j) {
+        x[j] = 1.0/A[17*N + j];
+    }
+    return make_tuple(A, x);
+}
+
+void benchmark_B(vector<double> const &A, vector<double> const &x, size_t NLOOPS, bool cblas) {
+    size_t N = x.size();
+    size_t M = A.size() / N;
+    vector<double> b(M);
+    double sum(0.0);
+
+    if (cblas == false) {
+        for (size_t i = 0; i < NLOOPS; ++i) {
+            b = matrix_vec(A,x);
+            sum += b[17];
+        }
+    } else if (cblas == true) {
+        for (size_t i = 0; i < NLOOPS; ++i) {
+            cblas_dgemv(CblasRowMajor, CblasNoTrans, M, N, 1.0, A.data(), N, x.data(), 1, 0, b.data(), 1);
+            sum += b[17];
+        }
+    }
+
+    // Check correctness
+    if (static_cast<size_t>(sum) != N*NLOOPS) {printf("  !!   W R O N G  result   !!\n");}
+}
+
+tuple<vector<double>, vector<double>> init_C(size_t M, size_t N, size_t L) {
+    vector<double> A(M*L), B(L*N);
+    for (size_t i = 0; i < M; ++i) {
+        for (size_t j = 0; j < L; ++j) {
+            A[i*L + j] = (i+j)%219 + 1.0;
+        }
+    }
+    // B chosen such that C[0,17]=L
+    // so B[i,17] = 1/A[0,i]
+    for (size_t i = 0; i < L; ++i) {
+        for (size_t j = 0; j < N; ++j) {
+            if (j==17) {
+                B[i*N + 17] = 1.0/A[i];
+            } else {
+                B[i*N + j] = (i+j)%219 + 1.0;
+            }
+        }
+    }
+    return make_tuple(A, B);
+}
+
+void benchmark_C(vector<double> const &A, vector<double> const &B, size_t L, size_t NLOOPS, bool cblas) {
+    size_t M = A.size() / L;
+    size_t N = B.size() / L;
+    vector<double> C(M*N);
+    double sum(0.0);
+
+    if (cblas == false) {
+        for (size_t i = 0; i < NLOOPS; ++i) {
+            C = matrix_matrix(A,B,M);
+            sum += C[17];
+        }
+    } else if (cblas == true) {
+        for (size_t i = 0; i < NLOOPS; ++i) {
+            cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, M, N, L, 1.0, A.data(), L, B.data(), N, 0.0, C.data(), N);
+            sum += C[17];
+        }
+    }
+
+    // Check correctness
+    if (static_cast<size_t>(sum) != L*NLOOPS) {printf("  !!   W R O N G  result   !!\n");}
+}
+
+tuple<vector<double>, vector<double>> init_D(size_t N, size_t p) {
+    // x_i = i/N for i=0,...,N-1
+    // a_j = 1   for j=0,...,p-1
+    vector<double> x(N), a(p);
+    for (size_t i = 0; i < N; ++i) {
+        x[i] = static_cast<double>(i) / N;
+    }
+    for (size_t j = 0; j < p; ++j) {
+        a[j] = 1.0;
+    }
+    return make_tuple(x, a);
+}
+
+void benchmark_D(vector<double> const &x, vector<double> const &a, size_t NLOOPS) {
+    size_t N = x.size();
+    vector<double> y(N);
+    double sum(0.0);
+
+    for (size_t i = 0; i < NLOOPS; ++i) {
+        y = poly(x,a);
+        sum += y[0];
+    }
+
+    // Check correctness
+    if (static_cast<size_t>(sum) != NLOOPS) {printf("  !!   W R O N G  result sum = %f  !!\n", sum);}
+}
--- a/ex3/code/task_4+6.h
+++ b/ex3/code/task_4+6.h
@ -0,0 +1,14 @@
+#pragma once
+#include <vector>
+using namespace std;
+
+void print_performance(double sec, size_t memory, size_t flops, unsigned int size);
+tuple<vector<double>, vector<double>> init_A(size_t N);
+tuple<vector<double>, vector<double>> init_B(size_t M, size_t N);
+tuple<vector<double>, vector<double>> init_C(size_t M, size_t N, size_t L);
+tuple<vector<double>, vector<double>> init_D(size_t N, size_t p);
+
+void benchmark_A(vector<double> const &x, vector<double> const &y, size_t NLOOPS, bool cblas);
+void benchmark_B(vector<double> const &A, vector<double> const &x, size_t NLOOPS, bool cblas);
+void benchmark_C(vector<double> const &A, vector<double> const &B, size_t L, size_t NLOOPS, bool cblas);
+void benchmark_D(vector<double> const &x, vector<double> const &a, size_t NLOOPS);
--- a/ex3/code/task_5.cpp
+++ b/ex3/code/task_5.cpp
@ -0,0 +1,33 @@
+#include "task_4+6.h"
+#include "task_5.h"
+#include "timing.h"
+#include <vector>
+#include <iostream>
+#include <cmath>
+using namespace std;
+
+double norm(vector<double> const &x) {
+    size_t N = x.size();
+    double sum = 0.0;
+    for (size_t i = 0; i < N; ++i) {
+        sum += x[i] * x[i];
+    }
+    return sqrt(sum);
+}
+
+vector<double> init_norm(size_t N) {
+    vector<double> x(N);
+    for (size_t i = 0; i < N; ++i) {
+        x[i] = i%219 + 1.0;
+    }
+    return x;
+}
+
+void benchmark_norm(vector<double> const &x, size_t NLOOPS) {
+    double s(0.0), sum(0.0);
+    for (size_t i = 0; i < NLOOPS; ++i) {
+        s = norm(x);
+        sum += s;
+    }
+    printf("||x|| = %f\n", sum/NLOOPS);
+}
--- a/ex3/code/task_5.h
+++ b/ex3/code/task_5.h
@ -0,0 +1,8 @@
+#pragma once
+
+#include <vector>
+using namespace std;
+
+double norm(vector<double> const &x);
+vector<double> init_norm(size_t N);
+void benchmark_norm(vector<double> const &x, size_t NLOOPS);
--- a/ex3/code/task_7.cpp
+++ b/ex3/code/task_7.cpp
@ -0,0 +1,43 @@
+#include "task_7.h"
+
+#include <iostream>
+#include <vector>
+#include <cmath>
+#include <lapacke.h>
+using namespace std;
+
+tuple<vector<double>, vector<double>> init_M(size_t N, size_t Nrhs) {
+    vector<double> A(N*N), b(N*Nrhs);
+    for (size_t i = 0; i < N; ++i) {
+        for (size_t j = 0; j < N; ++j) {
+            if (i != j) {
+                A[i*N + j] = 1.0 / pow(abs(1.0*i-1.0*j),2);
+            } else {
+                A[i*N + j] = 4;
+            }
+        }
+        for (size_t j=0; j < Nrhs; ++j) {
+            b[i*Nrhs + j] = 1.0*j;
+        }
+    }
+    return make_tuple(A, b);
+}
+
+void print_matrix(vector<double> &A, size_t M, size_t N) {
+    printf("\n");
+    for (size_t i = 0; i < M; ++i){
+        for (size_t j = 0; j < N; ++j) {
+            printf("%f ", A[i*N + j]);
+        }
+        printf("\n");
+    }
+    printf("\n\n");
+}
+
+void benchmark_lapacke(int N, int Nrhs) {
+    auto [A,b] = init_M(N,Nrhs);
+    int lda=N, ldb=Nrhs;
+    vector<int> ipiv(N);
+    LAPACKE_dgetrf(LAPACK_ROW_MAJOR, N, N, A.data(), lda, ipiv.data());
+    LAPACKE_dgetrs(LAPACK_ROW_MAJOR, 'N', N, Nrhs, A.data(), lda, ipiv.data(), b.data(), ldb);
+}
--- a/ex3/code/task_7.h
+++ b/ex3/code/task_7.h
@ -0,0 +1,7 @@
+#pragma once
+#include <vector>
+using namespace std;
+
+tuple<vector<double>, vector<double>> init_M(size_t N, size_t Nrhs);
+void print_matrix(vector<double> &A, size_t M, size_t N);
+void benchmark_lapacke(int N, int Nrhs);
--- a/ex3/code/timing.h
+++ b/ex3/code/timing.h
@ -0,0 +1,51 @@
+//
+//    Gundolf Haase, Oct 18 2024
+//
+#pragma once
+#include <chrono>                  // timing
+#include <stack>
+
+//using Clock = std::chrono::system_clock;   //!< The wall clock timer chosen
+using Clock = std::chrono::high_resolution_clock;
+using TPoint= std::chrono::time_point<Clock>;
+
+// [Galowicz, C++17 STL Cookbook, p. 29]
+inline 
+std::stack<TPoint> MyStopWatch; //!< starting time of stopwatch
+
+/** Starts stopwatch timer.
+ *  Use as @code tic(); myfunction(...) ; double tsec = toc();  @endcode
+ * 
+ *  The timining can be nested and the recent time point is stored on top of the stack.
+ * 
+ *  @return recent time point
+ *  @see toc
+ */
+inline auto tic()
+{
+    MyStopWatch.push(Clock::now());
+    return MyStopWatch.top();
+}
+ 
+/** Returns the elapsed time from stopwatch. 
+ * 
+ * The time point from top of the stack is used
+ * if time point @p t_b is not passed as input parameter.
+ * Use as @code tic(); myfunction(...) ; double tsec = toc();  @endcode
+ * or as @code auto t_b = tic(); myfunction(...) ; double tsec = toc(t_b);  @endcode
+ * The last option is to be used in the case of 
+ * non-nested but overlapping time measurements.
+ * 
+ * @param[in]  t_b start time of some stop watch
+ * @return elapsed time in seconds.
+ *
+*/ 
+inline double toc(TPoint const &t_b = MyStopWatch.top())
+{
+    // https://en.cppreference.com/w/cpp/chrono/treat_as_floating_point
+    using Unit      = std::chrono::seconds;
+    using FpSeconds = std::chrono::duration<double, Unit::period>;        
+    auto t_e = Clock::now();
+    MyStopWatch.pop();
+    return FpSeconds(t_e-t_b).count();
+}