113 changed files with 12407 additions and 4620 deletions
--- a/CLANG_default.mk
+++ b/CLANG_default.mk
@ -0,0 +1,123 @@
+# Basic Defintions for using GNU-compiler suite sequentially
+# requires setting of COMPILER=CLANG_
+
+#CLANGPATH=//usr/lib/llvm-10/bin/
+CC     = ${CLANGPATH}clang
+CXX    = ${CLANGPATH}clang++
+#CXX   = ${CLANGPATH}clang++ -lomptarget  -fopenmp-targets=nvptx64-nvidia-cuda --cuda-path=/opt/pgi/linux86-64/2017/cuda/8.0
+#F77   = gfortran
+LINKER = ${CXX}
+
+#http://clang.llvm.org/docs/UsersManual.html#options-to-control-error-and-warning-messages
+WARNINGS += -Weverything -Wno-c++98-compat -Wno-sign-conversion -Wno-date-time -Wno-shorten-64-to-32 -Wno-padded -ferror-limit=1
+WARNINGS += -Wdocumentation -Wconversion -Wshadow -Wfloat-conversion -pedantic
+#-fsyntax-only -Wdocumentation -Wconversion -Wshadow -Wfloat-conversion -pedantic
+
+CXXFLAGS += -O3 -std=c++17 -ferror-limit=1 ${WARNINGS}
+# don't use -Ofast
+# -ftrapv
+LINKFLAGS += -O3
+
+# different libraries in Ubuntu or manajaró
+ifndef UBUNTU
+UBUNTU=1
+endif
+
+# BLAS, LAPACK
+LINKFLAGS += -llapack -lblas
+# -lopenblas
+ifeq ($(UBUNTU),1)
+# ubuntu
+else
+# on  archlinux
+LINKFLAGS += -lcblas
+endif
+
+# interprocedural optimization
+CXXFLAGS  += -flto
+LINKFLAGS += -flto
+
+#   very good check
+# http://clang.llvm.org/extra/clang-tidy/
+#   good check, see:  http://llvm.org/docs/CodingStandards.html#include-style
+SWITCH_OFF=,-readability-magic-numbers,-readability-redundant-control-flow,-readability-redundant-member-init
+SWITCH_OFF+=,-readability-redundant-member-init,-readability-isolate-declaration
+#READABILITY=,readability*${SWITCH_OFF}
+#TIDYFLAGS = -checks=llvm-*,-llvm-header-guard -header-filter=.* -enable-check-profile -extra-arg="-std=c++17" -extra-arg="-fopenmp"
+TIDYFLAGS = -checks=llvm-*,-llvm-header-guard${READABILITY} -header-filter=.* -enable-check-profile -extra-arg="-std=c++17" -extra-arg="-fopenmp"
+#TIDYFLAGS += -checks='modernize*
+#   ???
+#TIDYFLAGS = -checks='cert*'  -header-filter=.*
+#   MPI checks ??
+#TIDYFLAGS = -checks='mpi*'
+#   ??
+#TIDYFLAGS = -checks='performance*'   -header-filter=.*
+#TIDYFLAGS = -checks='portability-*'  -header-filter=.*
+#TIDYFLAGS = -checks='readability-*'  -header-filter=.*
+
+default: ${PROGRAM}
+
+${PROGRAM}:	${OBJECTS}
+	$(LINKER)  $^  ${LINKFLAGS} -o $@
+
+clean:
+	@rm -f ${PROGRAM} ${OBJECTS}
+
+clean_all:: clean
+	@rm -f *_ *~ *.bak *.log *.out *.tar
+
+codecheck: tidy_check
+tidy_check:
+	clang-tidy ${SOURCES} ${TIDYFLAGS} -- ${SOURCES}
+# see also http://clang-developers.42468.n3.nabble.com/Error-while-trying-to-load-a-compilation-database-td4049722.html
+
+run: clean ${PROGRAM}
+#	time  ./${PROGRAM} ${PARAMS}
+	./${PROGRAM} ${PARAMS}
+
+# tar the current directory
+MY_DIR = `basename ${PWD}`
+tar: clean_all
+	@echo "Tar the directory: " ${MY_DIR}
+	@cd .. ;\
+	tar cf ${MY_DIR}.tar ${MY_DIR} *default.mk ;\
+	cd ${MY_DIR}
+# 	tar cf `basename ${PWD}`.tar *
+
+doc:
+	doxygen Doxyfile
+
+#########################################################################
+
+.cpp.o:
+	$(CXX) -c $(CXXFLAGS) -o $@ $<
+
+.c.o:
+	$(CC) -c $(CFLAGS) -o $@ $<
+
+.f.o:
+	$(F77) -c $(FFLAGS) -o $@ $<
+
+##################################################################################################
+#    some tools
+# Cache behaviour (CXXFLAGS += -g  tracks down to source lines; no -pg in linkflags)
+cache: ${PROGRAM}
+	valgrind --tool=callgrind --simulate-cache=yes ./$^  ${PARAMS}
+#	kcachegrind callgrind.out.<pid> &
+	kcachegrind `ls -1tr  callgrind.out.* |tail -1`
+
+# Check for wrong memory accesses, memory leaks, ...
+# use smaller data sets
+mem: ${PROGRAM}
+	valgrind -v --leak-check=yes --tool=memcheck --undef-value-errors=yes --track-origins=yes --log-file=$^.addr.out --show-reachable=yes ./$^  ${PARAMS}
+
+#  Simple run time profiling of your code
+#  CXXFLAGS += -g -pg
+#  LINKFLAGS += -pg
+prof: ${PROGRAM}
+	perf record ./$^  ${PARAMS}
+	perf report
+#	gprof -b ./$^ > gp.out
+#	kprof -f gp.out -p gprof &
+
+codecheck: tidy_check
--- a/GCC_default.mk
+++ b/GCC_default.mk
@ -0,0 +1,196 @@
+# Basic Defintions for using GNU-compiler suite sequentially
+# requires setting of COMPILER=GCC_
+
+CC	= gcc
+CXX = g++
+F77	= gfortran
+LINKER  = ${CXX}
+
+WARNINGS = -Wall -pedantic -Wextra -Weffc++ -Woverloaded-virtual -Wfloat-equal -Wshadow \
+           -Wredundant-decls -fmax-errors=1
+#  -Wunreachable-code  -Winline
+CXXFLAGS += -ffast-math -O3 -march=native -std=c++17 ${WARNINGS}
+#CXXFLAGS += -Ofast -funroll-all-loops -std=c++17 ${WARNINGS}
+#-msse3
+# -ftree-vectorizer-verbose=2  -DNDEBUG
+# -ftree-vectorizer-verbose=5
+# -ftree-vectorize -fdump-tree-vect-blocks=foo.dump  -fdump-tree-pre=stderr
+
+# CFLAGS	= -ffast-math -O3 -DNDEBUG -msse3 -fopenmp -fdump-tree-vect-details
+# CFLAGS	= -ffast-math -O3 -funroll-loops -DNDEBUG -msse3 -fopenmp -ftree-vectorizer-verbose=2
+# #CFLAGS	= -ffast-math -O3 -DNDEBUG -msse3 -fopenmp
+# FFLAGS	= -ffast-math -O3 -DNDEBUG -msse3 -fopenmp
+# LFLAGS  = -ffast-math -O3 -DNDEBUG -msse3 -fopenmp
+LINKFLAGS   += -O3
+
+#architecture
+#CPU = -march=znver2
+CXXFLAGS  += ${CPU}
+LINKFLAGS += ${CPU}
+
+# different libraries in Ubuntu or manajaró
+ifndef UBUNTU
+UBUNTU=1
+endif
+
+# BLAS, LAPACK
+ifeq ($(UBUNTU),1)
+LINKFLAGS += -llapack -lblas
+# -lopenblas
+else
+# on  archlinux
+LINKFLAGS += -llapack -lopenblas -lcblas
+endif
+
+# interprocedural optimization
+#CXXFLAGS  += -flto
+#LINKFLAGS += -flto
+
+# for debugging purpose (save code)
+# -fsanitize=leak         # only one out the three can be used
+# -fsanitize=address
+# -fsanitize=thread
+SANITARY =  -fsanitize=address  -fsanitize=undefined -fsanitize=null -fsanitize=return \
+ -fsanitize=bounds -fsanitize=alignment -fsanitize=float-divide-by-zero -fsanitize=float-cast-overflow \
+ -fsanitize=bool -fsanitize=enum -fsanitize=vptr
+#CXXFLAGS  += ${SANITARY}
+#LINKFLAGS += ${SANITARY}
+
+# profiling tools
+#CXXFLAGS  += -pg
+#LINKFLAGS += -pg
+
+
+default: ${PROGRAM}
+
+${PROGRAM}:	${OBJECTS}
+	$(LINKER)  $^  ${LINKFLAGS} -o $@
+
+clean:
+	@rm -f ${PROGRAM} ${OBJECTS}
+
+clean_all:: clean
+	-@rm -f *_ *~ *.bak *.log *.out *.tar *.orig *.optrpt
+	-@rm -rf html
+
+run: clean ${PROGRAM}
+#run: ${PROGRAM}
+#	time  ./${PROGRAM} ${PARAMS}
+	./${PROGRAM} ${PARAMS}
+
+# tar the current directory
+MY_DIR = `basename ${PWD}`
+tar: clean_all
+	@echo "Tar the directory: " ${MY_DIR}
+	@cd .. ;\
+	tar cf ${MY_DIR}.tar ${MY_DIR} *default.mk ;\
+	cd ${MY_DIR}
+# 	tar cf `basename ${PWD}`.tar *
+#find . -size +10M > large_files
+#--exclude-from ${MY_DIR}/large_files
+
+zip: clean
+	@echo "Zip the directory: " ${MY_DIR}
+	@cd .. ;\
+	zip -r ${MY_DIR}.zip ${MY_DIR} *default.mk ;\
+	cd ${MY_DIR}
+
+doc:
+	doxygen Doxyfile
+
+#########################################################################
+.SUFFIXES: .f90
+
+.cpp.o:
+	$(CXX) -c $(CXXFLAGS) -o $@ $<
+#	$(CXX) -c $(CXXFLAGS) -o $@ $<  2>&1 | tee -a $<.log 
+#	$(CXX) -c $(CXXFLAGS) -o $@ $<  2>&1 | tee -a $(<:.cpp=.log)
+
+.c.o:
+	$(CC) -c $(CFLAGS) -o $@ $<
+
+.f.o:
+	$(F77) -c $(FFLAGS) -o $@ $<
+
+.f90.o:
+	$(F77) -c $(FFLAGS) -o $@ $<
+
+##################################################################################################
+#    some tools
+# Cache behaviour (CXXFLAGS += -g  tracks down to source lines; no -pg in linkflags)
+cache: ${PROGRAM}
+	valgrind --tool=callgrind --simulate-cache=yes ./$^  ${PARAMS}
+#	kcachegrind callgrind.out.<pid> &
+	kcachegrind `ls -1tr  callgrind.out.* |tail -1`
+
+# Check for wrong memory accesses, memory leaks, ...
+# use smaller data sets
+# no "-pg"  in compile/link options
+mem: ${PROGRAM}
+	valgrind -v --leak-check=yes --tool=memcheck --undef-value-errors=yes --track-origins=yes --log-file=$^.addr.out --show-reachable=yes ./$^  ${PARAMS}
+# Graphical interface
+# valkyrie
+
+#  Simple run time profiling of your code
+CXXFLAGS += -g -pg
+LINKFLAGS += -pg
+prof: ${PROGRAM}
+	./$^  ${PARAMS}
+	gprof -b ./$^ > gp.out
+#	kprof -f gp.out -p gprof &
+
+#                   sudo apt install gprofng-gui
+#                   https://parallel.computer/presentations/PPoPP2023/2023-Ruud-Slides.pdf
+#   read §3 in      https://sourceware.org/binutils/docs/gprofng.html
+# /usr/bin/gp-collect-app -o test.1.er -p on -S on /home/ghaase/Lectures/Math2CPP/Codes/seq/jacobi_oo_stl/main.GCC_ 
+prof2: ${PROGRAM}
+	gprofng collect app -h auto ./$^  ${PARAMS}
+#	gprofng display text -functions `ls -1tdr test.*.er |tail -1`
+	gprofng display text -script gprofng_script2 `ls -1tdr test.*.er |tail -1`
+#	gprofng display text -script gprofng_script2 test.*.er
+#	gprofng display gui &
+	
+prof3: ${PROGRAM}
+	perf record ./$^  ${PARAMS}
+	perf report
+#  perf in Ubuntu 20.04:   https://www.howtoforge.com/how-to-install-perf-performance-analysis-tool-on-ubuntu-20-04/
+#  * install 
+#  * sudo vi /etc/sysctl.conf
+#                add   kernel.perf_event_paranoid = 0
+
+#Trace your heap:
+#> heaptrack ./main.GCC_
+#> heaptrack_gui heaptrack.main.GCC_.<pid>.gz
+heap: ${PROGRAM}
+	heaptrack ./$^  ${PARAMS}
+	heaptrack_gui  `ls -1tr  heaptrack.$^.* |tail -1` &
+
+codecheck: $(SOURCES)
+	cppcheck --enable=all --inconclusive --std=c++17 --suppress=missingIncludeSystem $^
+
+
+########################################################################
+#  get the detailed  status of all optimization flags
+info:
+	echo "detailed  status of all optimization flags"
+	$(CXX) --version
+	$(CXX) -Q $(CXXFLAGS) --help=optimizers
+	lscpu
+	inxi -C
+	lstopo
+
+# Excellent hardware info
+#	hardinfo
+# Life monitoring of CPU frequency etc.
+#	sudo i7z
+
+# Memory  consumption
+#	vmstat -at -SM 3
+#	xfce4-taskmanager
+
+
+# https://www.tecmint.com/check-linux-cpu-information/
+#https://www.tecmint.com/monitor-cpu-and-gpu-temperature-in-ubuntu/
+
+# Debugging:
+# https://wiki.archlinux.org/index.php/Debugging
--- a/cpp_workspace/ex5/ex5_1/check_env.h
+++ b/cpp_workspace/ex5/ex5_1/check_env.h
@ -1,99 +0,0 @@
-#pragma once
-
-#include <iostream>
-#ifdef _OPENMP
-#include <omp.h>
-#endif
-#include <unordered_map>
-
-//#####################################
-// G.Haase
-// See https://sourceforge.net/p/predef/wiki/Compilers/
-//     http://www.cplusplus.com/doc/tutorial/preprocessor/
-//  also:  export OMP_DISPLAY_ENV=VERBOSE
-//#####################################
-/** 	Checks for compilers, its versions, threads etc.
- * 
-	@param[in] argc	number of command line arguemnts
-	@param[in] argv	command line arguments as array of C-strings
-*/
-template <class T>
-void check_env(T argc, char const *argv[])
-{
-    std::cout << "\n#######################################################################\n";
-    std::cout << "Code    :";
-    for (T k = 0; k < argc; ++k) std::cout << "  " << argv[k];
-    std::cout << std::endl;
-
-    // compiler:      https://sourceforge.net/p/predef/wiki/Compilers/
-    std::cout <<    "Compiler:  ";
-#if defined __INTEL_COMPILER
-#pragma message(" ##########  INTEL  ###############")
-    std::cout << "INTEL " << __INTEL_COMPILER;
-    // Ignore warnings for #pragma acc   unrecognice
-#pragma warning disable 161
-    // Ignore warnings for #pragma omp   unrecognice
-#pragma warning disable 3180
-
-#elif defined __PGI
-#pragma message(" ##########  PGI    ###############")
-    std::cout << "PGI " << __PGIC__ << "." << __PGIC_MINOR__ << "." << __PGIC_PATCHLEVEL__;
-#elif defined  __clang__
-#pragma message(" ##########  CLANG    ###############")
-    std::cout << "CLANG " << __clang_major__ << "." << __clang_minor__ << "."; // << __clang_patchlevel__;
-#elif defined __GNUC__
-#pragma message(" ##########  Gnu    ###############")
-    std::cout << "Gnu " <<  __GNUC__  << "." << __GNUC_MINOR__ << "." << __GNUC_PATCHLEVEL__;
-#else
-#pragma message(" ##########  unknown Compiler   ###############")
-    std::cout << "unknown";
-#endif
-    std::cout << "  C++ standard: " << __cplusplus << std::endl;
-
-    // Parallel environments
-    std::cout <<    "Parallel:  ";
-#if defined MPI_VERSION
-#pragma message(" ##########  MPI    ###############")
-#ifdef OPEN_MPI
-    std::cout << "OpenMPI ";
-#else
-    std::cout << "MPI ";
-#endif
-    std::cout << MPI_VERSION << "." << MPI_SUBVERSION << "   ";
-#endif
-
-#ifdef _OPENMP
-//https://www.openmp.org/specifications/
-//https://stackoverflow.com/questions/1304363/how-to-check-the-version-of-openmp-on-linux
-    std::unordered_map<unsigned, std::string> const map{
-        {200505, "2.5"}, {200805, "3.0"}, {201107, "3.1"}, {201307, "4.0"}, {201511, "4.5"}, {201611, "5.0"}, {201811, "5.0"}};
-#pragma message(" ##########  OPENMP    ###############")
-    //std::cout << _OPENMP;
-    std::cout << "OpenMP ";
-    try {
-        std::cout << map.at(_OPENMP);
-    }
-    catch (...) {
-        std::cout << _OPENMP;
-    }
-    #pragma omp parallel
-    {
-        #pragma omp master
-        {
-            const int nn = omp_get_num_threads();          // OpenMP
-            std::cout << " ---> " <<  nn << " Threads   ";
-        }
-        #pragma omp barrier
-    }
-
-#endif
-#ifdef _OPENACC
-#pragma message(" ##########  OPENACC    ###############")
-    std::cout << "OpenACC   ";
-#endif
-    std::cout << std::endl;
-    std::cout << "Date    :  " << __DATE__ << "  " << __TIME__;
-    std::cout << "\n#######################################################################\n";
-}
-// HG
-
--- a/cpp_workspace/ex5/ex5_1/main.cpp
+++ b/cpp_workspace/ex5/ex5_1/main.cpp
@ -1,142 +0,0 @@
-#include "check_env.h"
-#include "mylib.h"
-#include <cstdlib>          // atoi()
-#include <cstring>          // strncmp()
-#include <ctime>
-#include <iostream>
-#include <omp.h>            // OpenMP
-#include <sstream>
-#include <string>
-using namespace std;
-
-int main(int argc, char const *argv[])
-{
-    omp_set_schedule(omp_sched_static, 2000000);
-    //omp_set_schedule(omp_sched_dynamic, 1000000);
-    //omp_set_schedule(omp_sched_guided, 1000000);
-    //omp_set_schedule(omp_sched_auto, 1); // chunk size does not matter for auto
-
-
-    // Speedup for different number of cores (incl. hyperthreading)
-    omp_set_num_threads(8);
-
-    // Print number of available processors
-    cout << "Number of available processors: " << omp_get_num_procs() << endl;
-
-    // Currently executing parallel code? -> no
-    cout << "Currently in parallel? " << omp_in_parallel() << endl;
-
-
-    int const NLOOPS = 10;        // chose a value such that the benchmark runs at least 10 sec.
-    unsigned int N = 500000001;
-
-
-//##########################################################################
-//   Read Parameter from command line  (C++ style)
-    cout << "Checking command line parameters for: -n <number> " << endl;
-    for (int i = 1; i < argc; i++)
-    {
-        cout << " arg[" << i << "] = " << argv[i] << endl;
-        string ss(argv[i]);
-        if ("-n"==ss && i + 1 < argc) // found "-n" followed by another parameter
-        {
-            N = static_cast<unsigned int>(atoi(argv[i + 1]));
-        }
-        else
-        {
-            cout << "Corect call: " << argv[0] << " -n  <number>\n";
-        }
-    }
-
-    cout << "\nN = " << N << endl;
-    
-    check_env(argc, argv);
-//########################################################################
-    int nthreads;                                  // OpenMP
-    #pragma omp parallel default(none) shared(cout,nthreads)
-    {
-        stringstream inparallel;
-        inparallel << "Currently in parallel? " << omp_in_parallel() << endl;
-
-
-        int const th_id  = omp_get_thread_num();   // OpenMP
-        int const nthrds = omp_get_num_threads();  // OpenMP
-        stringstream ss;
-        ss << "C++: Hello World from thread " << th_id << " / " << nthrds << endl;
-        #pragma omp critical
-        {
-            cout << ss.str();                      // output to a shared ressource
-            cout << inparallel.str() << endl;
-        }
-        #pragma omp master
-        nthreads = nthrds;                         // transfer nn to to master thread
-    }
-    cout << "   " << nthreads << "   threads have been started." << endl;
-
-//##########################################################################
-//  Memory allocation
-    cout << "Memory allocation\n";
-
-    vector<double> x(N), y(N);
-
-    cout.precision(2);
-    cout << 2.0 * N *sizeof(x[0]) / 1024 / 1024 / 1024 << " GByte Memory allocated\n";
-    cout.precision(6);
-
-//##########################################################################
-//  Data initialization
-//  Special:  x_i = i+1;  y_i = 1/x_i  ==> <x,y> == N
-    for (unsigned int i = 0; i < N; ++i)
-    {
-        x[i] = i + 1;
-        y[i] = 1.0 / x[i];
-    }
-
-//##########################################################################
-    cout << "\nStart Benchmarking\n";
-
-// Do calculation
-    double tstart = omp_get_wtime();                  // OpenMP
-
-    double sk(0.0);
-    for (int i = 0; i < NLOOPS; ++i)
-    {
-        //sk = scalar(x, y);
-        sk = scalar_parallel(x, y);
-        //sk = scalar_trans(x, y);
-        //sk = norm(x);
-    }
-
-    double t1 = omp_get_wtime() - tstart;             // OpenMP
-
-    t1 /= NLOOPS;           // divide by number of function calls
-
-//##########################################################################
-// Check the correct result
-    cout << "\n <x,y> = " << sk << endl;
-    if (static_cast<unsigned int>(sk) != N)
-    {
-        cout << "  !!   W R O N G  result   !!\n";
-    }
-    cout << endl;
-
-//##########################################################################
-// Timings  and Performance
-    cout << endl;
-    cout.precision(2);
-    cout << "Total benchmarking time: " << t1*NLOOPS << endl;
-    cout << "Timing in sec. : " << t1 << endl;
-    cout << "GFLOPS         : " << 2.0 * N / t1 / 1024 / 1024 / 1024 << endl;
-    cout << "GiByte/s        : " << 2.0 * N / t1 / 1024 / 1024 / 1024 * sizeof(x[0]) << endl;
-
-//#########################################################################
-
-    cout << "\n  Try the reduction with an STL-vektor \n";
-    
-    auto vr = reduction_vec_append(5);
-    cout << "done\n";
-    cout << vr << endl;
-
-
-    return 0;
-}  // memory for x and y will be deallocated their destructors
--- a/cpp_workspace/ex5/ex5_1/mylib.cpp
+++ b/cpp_workspace/ex5/ex5_1/mylib.cpp
@ -1,137 +0,0 @@
-#include "mylib.h"
-#include <cassert>       // assert()
-#include <cmath>
-#include <iostream>
-#include <functional>    // multiplies<>{}
-#include <list>
-#include <numeric>       // iota()
-#ifdef _OPENMP
-#include <omp.h>
-#endif
-#include <vector>
-using namespace std;
-
-
-double scalar_parallel(vector<double> const &x, vector<double> const &y)
-{
-    assert(x.size() == y.size()); // switch off via compile flag: -DNDEBUG
-    size_t const N = x.size();
-    double sum = 0.0;
-#pragma omp parallel default(none) shared(x,y,N, cout) reduction(+:sum)
-    {
-        const size_t nthreads = omp_get_num_threads();
-        const size_t threadnum = omp_get_thread_num();
-        const size_t chunksize = N/nthreads;
-
-
-        size_t start = threadnum*chunksize;
-        size_t end = start + chunksize;
-        if (threadnum == nthreads - 1)
-            end = N;
-
-
-        for (size_t i = start; i < end; ++i)
-        {
-            sum += x[i] * y[i];
-        }
-        
-    }
-    return sum;
-}
-
-
-vector<int> reduction_vec_append(int n)
-{ 
-    vector<int> vec(n);
-#pragma omp parallel default(none) shared(cout) reduction(VecAppend:vec)
-    {
-        #pragma omp barrier
-        #pragma omp critical
-        cout << omp_get_thread_num() << " : " << vec.size() << endl;
-        #pragma omp barrier
-        iota( vec.begin(),vec.end(), omp_get_thread_num() );
-        #pragma omp barrier
-
-    }
-    return vec;
-}
-
-
-
-
-
-
-
-
-
-
-
-double scalar(vector<double> const &x, vector<double> const &y)
-{
-    assert(x.size() == y.size()); // switch off via compile flag: -DNDEBUG
-    size_t const N = x.size();
-    double sum = 0.0;
-#pragma omp parallel for default(none) shared(x,y,N) reduction(+:sum) schedule(runtime) // added schedule(runtime)
-    for (size_t i = 0; i < N; ++i)
-    {
-        sum += x[i] * y[i];
-        //sum += exp(x[i])*log(y[i]);
-    }
-    return sum;
-}
-
-
-double norm(vector<double> const &x)
-{
-    size_t const N = x.size();
-    double sum = 0.0;
-#pragma omp parallel for default(none) shared(x,N) reduction(+:sum) schedule(runtime) // added schedule(runtime)
-    for (size_t i = 0; i < N; ++i)
-    {
-        sum += x[i]*x[i];
-    }
-    return sum;
-}
-
-
-vector<int> reduction_vec(int n)
-{ 
-    vector<int> vec(n);
-#pragma omp parallel default(none) shared(cout) reduction(VecAdd:vec)
-    {
-        #pragma omp barrier
-        #pragma omp critical
-        cout << omp_get_thread_num() << " : " << vec.size() << endl;
-        #pragma omp barrier
-        iota( vec.begin(),vec.end(), omp_get_thread_num() );
-        #pragma omp barrier
-
-    }
-    return vec;
-}
-
-
-double scalar_trans(vector<double> const &x, vector<double> const &y)
-{
-    assert(x.size() == y.size()); // switch off via compile flag: -DNDEBUG
-    vector<double> z(x.size());
-    //list<double> z(x.size());  // parallel for-loop on iterators not possible (missing 'operator-')
-                                 // c++-20 CLANG_, ONEAPI_:condition of OpenMP for loop must be a relational comparison 
-    
-    transform(cbegin(x),cend(x),cbegin(y),begin(z),std::multiplies<>{});
-    
-    double sum = 0.0;
-#pragma omp parallel for default(none) shared(z) reduction(+:sum) schedule(runtime) // added schedule(runtime)
-    for (auto pi = cbegin(z); pi!=cend(z); ++pi)
-    {
-        sum += *pi;
-    }
-    //for (auto val: z)
-    //{
-        //sum += val;
-    //}    
-    return sum;
-}
-
-
-
--- a/cpp_workspace/ex5/ex5_1/mylib.h
+++ b/cpp_workspace/ex5/ex5_1/mylib.h
@ -1,88 +0,0 @@
-#pragma once
-#include <cassert>
-#include <iomanip>             // setw()
-#include <iostream>
-#include <omp.h>
-#include <vector>
-
-/** 	Inner product
-	@param[in] x	vector
-	@param[in] y	vector
-	@return 	    resulting Euclidian inner product <x,y>
-*/
-double scalar_parallel(std::vector<double> const &x, std::vector<double> const &y);
-double scalar(std::vector<double> const &x, std::vector<double> const &y);
-double scalar_trans(std::vector<double> const &x, std::vector<double> const &y);
-
-
-
-// Declare additional reduction operation in OpenMP for STL-vector
-#pragma omp declare reduction(VecAppend : std::vector<int>  : omp_out.insert(omp_out.end(), omp_in.begin(), omp_in.end())) \
- initializer (omp_priv=omp_orig)
- 
-std::vector<int> reduction_vec_append(int n);
-
-
-/** 	l2-norm
-	@param[in] x	vector
-	@return 	    resulting Euclidian norm
-*/
-double norm(std::vector<double> const &x);
-
-/** 	 Vector @p b adds its elements to vector @p a .
-	@param[in] a	vector
-	@param[in] b	vector
-	@return 	     a+=b componentwise
-*/
-template<class T>
-std::vector<T> &operator+=(std::vector<T> &a, std::vector<T> const &b)
-{
-    assert(a.size()==b.size());
-    for (size_t k = 0; k < a.size(); ++k) {
-        a[k] += b[k];
-    }
-    return a;
-}
-
-// Declare the reduction operation in OpenMP for an STL-vector
-//   omp_out += omp_in  requires operator+=(vector<int> &, vector<int> const &) from above
-// ------------------------------------------------------------
-// https://scc.ustc.edu.cn/zlsc/tc4600/intel/2016.0.109/compiler_c/common/core/GUID-7312910C-D175-4544-99C5-29C12D980744.htm
-// https://gist.github.com/eruffaldi/7180bdec4c8c9a11f019dd0ba9a2d68c
-// https://stackoverflow.com/questions/29633531/user-defined-reduction-on-vector-of-varying-size
-//  see also p.74ff in  https://www.fz-juelich.de/ias/jsc/EN/AboutUs/Staff/Hagemeier_A/docs-parallel-programming/OpenMP-Slides.pdf
-#pragma omp declare reduction(VecAdd : std::vector<int>  : omp_out += omp_in) \
- initializer (omp_priv=omp_orig)
-
-//   Templates are  n o t  possible, i.e. the reduction has to be declared fore a specified type.
-//template <class T>
-//#pragma omp declare reduction(VecAdd : std::vector<T>  : omp_out += omp_in) initializer (omp_priv(omp_orig))
-// MS: template nach #pragma  !?
-
-// ------------------------------------------------------------
-
-/** 	Test for vector reduction.
- * 
- * The thread-private vectors of size @p n are initialized via @f$v_k^{tID}=tID+k@f$.
- * Afterwards these vectors are accumulated, i.e., 
- * @f$v_k= \sum_{tID=0}^{numThreads} v_k^{tID}@f$.
- * 
- * 	@param[in] n  size of global/private vector
- * 	@return  resulting global vector.
-*/
-std::vector<int> reduction_vec(int n);
-
-
-
-/** 	Output of a vector.
-	@param[in,out] s	output stream
-	@param[in]     x	vector
-	@return 	   modified output stream
-*/
-template <class T>
-std::ostream &operator<<(std::ostream &s, std::vector<T> const &x)
-{
-    for (auto const &v : x)  s << std::setw(4) << v << "  ";
-    return s;
-}
-
--- a/cpp_workspace/ex5/ex5_1/timing.h
+++ b/cpp_workspace/ex5/ex5_1/timing.h
@ -1,70 +0,0 @@
-#pragma once
-#include <chrono>                  // timing
-#include <stack>
-
-using Clock = std::chrono::system_clock;   //!< The wall clock timer chosen
-//using Clock = std::chrono::high_resolution_clock;
-using TPoint= std::chrono::time_point<Clock>;
-
-// [Galowicz, C++17 STL Cookbook, p. 29]
-inline 
-std::stack<TPoint> MyStopWatch; //!< starting time of stopwatch
-
-/** Starts stopwatch timer.
- *  Use as @code tic(); myfunction(...) ; double tsec = toc();  @endcode
- * 
- *  The timining is allowed to be nested and the recent time is stored on top of the stack.
- * 
- *  @return recent time
- *  @see toc
- */
-inline auto tic()
-{
-    MyStopWatch.push(Clock::now());
-    return MyStopWatch.top();
-}
- 
-/** Returns the elapsed time from stopwatch. 
- * 
- * The time from top of the stack is used
- * if time point @p t_b is not passed as input parameter.
- * Use as @code tic(); myfunction(...) ; double tsec = toc();  @endcode
- * or as @code auto t_b = tic(); myfunction(...) ; double tsec = toc(t_b);  @endcode
- * The last option is to be used in the case of 
- * non-nested but overlapping time measurements.
- * 
- * @param[in]  t_b start time of some stop watch
- * @return elapsed time in seconds.
- *
-*/ 
-inline double toc(TPoint const &t_b = MyStopWatch.top())
-{
-    // https://en.cppreference.com/w/cpp/chrono/treat_as_floating_point
-    using Unit      = std::chrono::seconds;
-    using FpSeconds = std::chrono::duration<double, Unit::period>;        
-    auto t_e = Clock::now();
-    MyStopWatch.pop();
-    return FpSeconds(t_e-t_b).count();
-}
-
-#include <iostream>
-#include <string>
-/** Executes function @p f and measures/prints elapsed wall clock time in seconds
- * 
- * Call as
- * @code measure("Time for (b = b + 1)", [&]() {
-        thrust::transform(b.begin(), b.end(), b.begin(), increment());
-    }); @endcode
- * 
- * @param[in]  label  additional string to be printed with the measurement.
- * @param[in]  f      function to execute.
- * @author Therese Bösmüller, 2025
- *
-*/ 
-auto measure = [](const std::string& label, auto&& f) {
-        auto start    = std::chrono::high_resolution_clock::now();
-        f();
-        auto stop     = std::chrono::high_resolution_clock::now();
-        auto duration = std::chrono::duration_cast<std::chrono::microseconds>(stop - start).count();
-        std::cout << label << ": " << duration << " microseconds" << std::endl;
-};          // ';' is needed for a visible documentation of this lambda-function
--- a/cpp_workspace/ex5/ex5_2/main.GCC_
+++ b/cpp_workspace/ex5/ex5_2/main.GCC_
--- a/cpp_workspace/ex5/ex5_2/main.cpp
+++ b/cpp_workspace/ex5/ex5_2/main.cpp
@ -1,130 +0,0 @@
-#include "mylib.h"
-#include <fstream>
-#include <iostream>
-#include <omp.h>
-#include <vector>
-using namespace std;
-
-
-int main()
-{
-    // read vector from file
-    vector<size_t> data_vector = {};
-
-    ifstream input_stream("data_1.txt"); 
-
-    size_t line;
-    while(input_stream >> line)
-    {
-        data_vector.push_back(line);
-    }
-    data_vector.shrink_to_fit();
-
-
-
-
-    // specify loops
-    size_t NLOOPS = 10000;
-
-
-
-    // ############# Parallelization with openMP #############
-    // calculate arithmetic mean, geometric mean and harmonic mean
-    double am_omp, gm_omp, hm_omp;
-
-    double tstart = omp_get_wtime();
-
-    for (size_t i = 0; i < NLOOPS; ++i)
-        means_omp(data_vector, am_omp, gm_omp, hm_omp);
-
-    double t_means_omp = (omp_get_wtime() - tstart)/NLOOPS;
-
-    // calculate minimum and maximum
-    size_t min, max;
-
-    tstart = omp_get_wtime();
-
-    for (size_t i = 0; i < NLOOPS; ++i)
-        minmax_omp(data_vector, min, max);
-
-    double t_minmax_omp = (omp_get_wtime() - tstart)/NLOOPS;
-
-
-
-    
-
-
-    // ############# Parallelization with C++ algorithms #############
-    // calculate arithmetic mean, geometric mean and harmonic mean
-    double am_cpp, gm_cpp, hm_cpp;
-
-    tstart = omp_get_wtime();
-
-    for (size_t i = 0; i < NLOOPS; ++i)
-        means_cpp(data_vector, am_cpp, gm_cpp, hm_cpp);
-
-    double t_means_cpp = (omp_get_wtime() - tstart)/NLOOPS;
-
-    // calculate minimum and maximum
-    size_t min_cpp, max_cpp;
-
-    tstart = omp_get_wtime();
-
-    for (size_t i = 0; i < NLOOPS; ++i)
-        minmax_cpp(data_vector, min_cpp, max_cpp);
-
-    double t_minmax_cpp = (omp_get_wtime() - tstart)/NLOOPS;
-
-
-
-
-
-    // print results
-    cout << "####### OpenMP #######" << endl;
-    cout << "minimum: " << min << endl;
-    cout << "maximum: " << max << endl;
-    cout << "duration: " << t_minmax_omp << endl << endl;
-
-    cout << "arithmetic mean: " << am_omp << endl;
-    cout << "geometric mean: " << gm_omp << endl;
-    cout << "harmonic mean: " << hm_omp << endl;
-    cout << "duration: " << t_means_omp << endl << endl;
-
-
-    cout << "####### C++ #######" << endl;
-    cout << "minimum: " << min_cpp << endl;
-    cout << "maximum: " << max_cpp << endl;
-    cout << "duration: " << t_minmax_cpp << endl << endl;
-
-    cout << "arithmetic mean: " << am_cpp << endl;
-    cout << "geometric mean: " << gm_cpp << endl;
-    cout << "harmonic mean: " << hm_cpp << endl;
-    cout << "duration: " << t_means_cpp << endl << endl;
-
-
-
-    // ####### OpenMP #######
-    // minimum: 1
-    // maximum: 1000
-    // duration: 3.52086e-06
-
-    // arithmetic mean: 498.184
-    // geometric mean: 364.412
-    // harmonic mean: 95.6857
-    // duration: 5.90171e-06
-
-    // ####### C++ #######
-    // minimum: 1
-    // maximum: 1000
-    // duration: 1.76816e-05
-
-    // arithmetic mean: 498.184
-    // geometric mean: 364.412
-    // harmonic mean: 95.6857
-    // duration: 2.35728e-05
-
-    // --> the openMP variant is faster in both cases
-
-
-    return 0;
-}
--- a/cpp_workspace/ex5/ex5_2/main.o
+++ b/cpp_workspace/ex5/ex5_2/main.o
--- a/cpp_workspace/ex5/ex5_2/mylib.cpp
+++ b/cpp_workspace/ex5/ex5_2/mylib.cpp
@ -1,103 +0,0 @@
-#include "mylib.h"
-#include <algorithm>
-#include <cmath>
-#include <execution>
-#include <iostream>
-#include <numeric>
-#include <omp.h>
-
-using namespace std;
-
-
-void means_omp(const std::vector<size_t> numbers, double &am, double &gm, double &hm)
-{
-    size_t const n = numbers.size();
-
-    am = 0.;
-    gm = 0.;
-    hm = 0.;
-
-#pragma omp parallel for shared(numbers, n, cout) reduction(+:am, gm, hm)
-    for (size_t i = 0; i < n; ++i)
-    {
-        am += numbers[i];
-        gm += log(numbers[i]);
-        hm += 1.0/numbers[i];
-
-        // #pragma omp critical
-        // {
-        //     cout << "Thread number " << omp_get_thread_num() << " processes value " << numbers[i] << endl;
-        // }
-    }
-
-    am /= n;
-    gm = exp(gm/n);
-    hm = n/hm;
-}
-
-
-void minmax_omp(const std::vector<size_t> numbers, size_t &global_min, size_t &global_max)
-{
-    size_t const n = numbers.size();
-
-    global_min = -1; // gives the maximum size_t value
-    global_max = 0;
-
-#pragma omp parallel shared(numbers, n, global_min, global_max)
-    {
-        const size_t nthreads = omp_get_num_threads();
-        const size_t threadnum = omp_get_thread_num();
-        const size_t chunksize = n/nthreads;
-
-
-        size_t start = threadnum*chunksize;
-        size_t end = start + chunksize;
-        if (threadnum == nthreads - 1)
-            end = n;
-
-
-        size_t local_min = -1;
-        size_t local_max = 0;
-        for (size_t i = start; i < end ; ++i)
-        {
-            if (numbers[i] < local_min)
-                local_min = numbers[i];
-
-            if (numbers[i] > local_max)
-                local_max = numbers[i];
-        }
-
-        #pragma omp critical
-        {
-            if (local_min < global_min)
-                global_min = local_min;
-            
-            if (local_max > global_max)
-                global_max = local_max;
-        }
-    }
-}
-
-
-void means_cpp(const std::vector<size_t> numbers, double &am, double &gm, double &hm)
-{
-    size_t const n = numbers.size();
-
-    am = reduce(std::execution::par, numbers.begin(), numbers.end());
-    gm = transform_reduce(std::execution::par, numbers.begin(), numbers.end(), 0.0, plus{}, [] (size_t x) -> double { return log(x); } );
-    hm = transform_reduce(std::execution::par, numbers.begin(), numbers.end(), 0.0, plus{}, [] (size_t x) -> double { return 1.0/x; });
-
-    am /= n;
-    gm = exp(gm/n);
-    hm = n/hm;
-}
-
-
-void minmax_cpp(const std::vector<size_t> numbers, size_t &global_min, size_t &global_max)
-{
-    auto min_it = min_element(std::execution::par, numbers.begin(), numbers.end());
-    auto max_it = max_element(std::execution::par, numbers.begin(), numbers.end());
-    
-    global_min = *min_it;
-    global_max = *max_it;
-}
--- a/cpp_workspace/ex5/ex5_2/mylib.h
+++ b/cpp_workspace/ex5/ex5_2/mylib.h
@ -1,42 +0,0 @@
-#include <vector>
-
-/**
-  This function calculates arithmetic mean, geometric mean and harmonic mean of an integer vector.
-  Uses openMP parallelization.
-  @param[in]    numbers      vector containing integers
-  @param[out]   am           arithmetic mean
-  @param[out]   gm           geometric mean
-  @param[out]   hm           harmonic mean
-*/
-void means_omp(const std::vector<size_t> numbers, double &am, double &gm, double &hm);
-
-
-/**
-  This function calculates the minimum and maximum of a vector.
-  Uses openMP parallelization.
-  @param[in]    numbers         vector containing integers
-  @param[out]   global_min      minimum
-  @param[out]   global_max      maximum
-*/
-void minmax_omp(const std::vector<size_t> numbers, size_t &global_min, size_t &global_max);
-
-
-/**
-  This function calculates arithmetic mean, geometric mean and harmonic mean of an integer vector.
-  Uses C++ parallelization.
-  @param[in]    numbers      vector containing integers
-  @param[out]   am           arithmetic mean
-  @param[out]   gm           geometric mean
-  @param[out]   hm           harmonic mean
-*/
-void means_cpp(const std::vector<size_t> numbers, double &am, double &gm, double &hm);
-
-
-/**
-  This function calculates the minimum and maximum of a vector.
-  Uses C++ parallelization.
-  @param[in]    numbers         vector containing integers
-  @param[out]   global_min      minimum
-  @param[out]   global_max      maximum
-*/
-void minmax_cpp(const std::vector<size_t> numbers, size_t &global_min, size_t &global_max);
--- a/cpp_workspace/ex5/ex5_2/mylib.o
+++ b/cpp_workspace/ex5/ex5_2/mylib.o
--- a/cpp_workspace/ex5/ex5_3/Makefile
+++ b/cpp_workspace/ex5/ex5_3/Makefile
@ -1,30 +0,0 @@
-#
-# use GNU-Compiler tools
-COMPILER=GCC_
-# alternatively from the shell
-# export COMPILER=GCC_
-# or, alternatively from the shell
-# make COMPILER=GCC_
-
-# use Intel compilers
-#COMPILER=ICC_
-
-# use PGI compilers
-# COMPILER=PGI_
-
-
-SOURCES = main.cpp goldbach.cpp 
-OBJECTS = $(SOURCES:.cpp=.o)
-
-PROGRAM	= main.${COMPILER}
-
-# uncomment the next to lines for debugging and detailed performance analysis
-CXXFLAGS += -g
-LINKFLAGS += -g
-# do not use -pg with PGI compilers
-
-ifndef COMPILER
-  COMPILER=GCC_
-endif
-
-include ../${COMPILER}default.mk
--- a/cpp_workspace/ex5/ex5_3/goldbach.cpp
+++ b/cpp_workspace/ex5/ex5_3/goldbach.cpp
@ -1,46 +0,0 @@
-#include "goldbach.h"
-#include <iostream>
-#include <iterator>
-#include <omp.h>
-
-size_t single_goldbach(size_t k)
-{
-    const std::vector<size_t> relevant_primes = get_primes(k);
-    size_t m = relevant_primes.size();
-
-    size_t counter = 0;
-
-#pragma omp parallel for shared(relevant_primes, m, k) reduction(+:counter)
-    for(size_t i = 0; i < m; ++i)
-    {
-        for(size_t j = i; j < m; ++j)
-        {
-            if(relevant_primes[i] + relevant_primes[j] == k)
-                ++counter;
-        }
-    }
-
-    return counter;
-}
-
-
-std::vector<size_t> count_goldbach(size_t n)
-{
-    const std::vector<size_t> relevant_primes = get_primes(n);
-    size_t m = relevant_primes.size();
-
-    std::vector<size_t> counter_vector(n + 1, 0);
-
-#pragma omp parallel for shared(relevant_primes, m, n) reduction(VecAdd:counter_vector)
-    for(size_t i = 0; i < m; ++i)
-    {
-        for(size_t j = i; j < m; ++j)
-        {
-            size_t sum = relevant_primes[i] + relevant_primes[j];
-            if(sum <= n)
-                ++counter_vector[relevant_primes[i] + relevant_primes[j]];
-        }
-    }
-
-    return counter_vector;
-}
--- a/cpp_workspace/ex5/ex5_3/goldbach.h
+++ b/cpp_workspace/ex5/ex5_3/goldbach.h
@ -1,45 +0,0 @@
-#pragma once
-#include "mayer_primes.h"
-#include <cassert>
-#include <vector>
-
-
-/**
-  This function returns the number of possible decompositions of an integer into a sum of two prime numbers.
-  @param[in]    k            first integer
-  @param[out]   count        number of decompositions
-*/
-size_t single_goldbach(size_t k);
-
-
-/**
-  This function returns the number of possible decompositions into a sum of two prime numbers of all even integers in the interval [4,n].
-  @param[in]    n                   upper integer bound
-  @param[out]   count_vector        vector containing the number of decompositions for a natural number the corresponding index
-*/
-std::vector<size_t> count_goldbach(size_t n);
-
-
-/** 	 Vector @p b adds its elements to vector @p a .
-	@param[in] a	vector
-	@param[in] b	vector
-	@return 	     a+=b componentwise
-*/
-template<class T>
-std::vector<T> &operator+=(std::vector<T> &a, std::vector<T> const &b)
-{
-    assert(a.size()==b.size());
-    for (size_t k = 0; k < a.size(); ++k) {
-        a[k] += b[k];
-    }
-    return a;
-}
-
-// Declare the reduction operation in OpenMP for an STL-vector
-//   omp_out += omp_in  requires operator+=(vector<int> &, vector<int> const &) from above
-// ------------------------------------------------------------
-// https://scc.ustc.edu.cn/zlsc/tc4600/intel/2016.0.109/compiler_c/common/core/GUID-7312910C-D175-4544-99C5-29C12D980744.htm
-// https://gist.github.com/eruffaldi/7180bdec4c8c9a11f019dd0ba9a2d68c
-// https://stackoverflow.com/questions/29633531/user-defined-reduction-on-vector-of-varying-size
-//  see also p.74ff in  https://www.fz-juelich.de/ias/jsc/EN/AboutUs/Staff/Hagemeier_A/docs-parallel-programming/OpenMP-Slides.pdf
-#pragma omp declare reduction(VecAdd : std::vector<size_t>  : omp_out += omp_in) initializer (omp_priv=omp_orig)
--- a/cpp_workspace/ex5/ex5_3/main.cpp
+++ b/cpp_workspace/ex5/ex5_3/main.cpp
@ -1,45 +0,0 @@
-#include "goldbach.h"
-#include <algorithm>
-#include <iostream>
-#include <omp.h>
-using namespace std;
-
-
-int main()
-{
-    cout << "Check: 694 has "<< single_goldbach(694) << " decompositions." << endl << "----------------------------------------" << endl;
-
-    for(size_t n : {10000, 100000, 400000, 1000000, 2000000})
-    {
-        double t_start = omp_get_wtime();
-
-        auto goldbach_vector = count_goldbach(n);
-
-        auto max_it = max_element(goldbach_vector.begin(), goldbach_vector.end());
-        size_t max_number = distance(goldbach_vector.begin(), max_it);
-
-        double t_end = omp_get_wtime() - t_start;
-
-        cout << "The number " <<  max_number << " has " << *max_it << " decompositions. Duration: " << t_end << endl;
-    }
-
-    /*
-    ###### WITHOUT PARALLELIZATION ######
-    The number 9240 has 329 decompositions. Duration: 0.00307696
-    The number 99330 has 2168 decompositions. Duration: 0.189839
-    The number 390390 has 7094 decompositions. Duration: 1.3042
-    The number 990990 has 15594 decompositions. Duration: 5.45034
-    The number 1981980 has 27988 decompositions. Duration: 47.1807
-
-
-    ###### WITH PARALLELIZATION ######
-    The number 9240 has 329 decompositions. Duration: 0.000734854
-    The number 99330 has 2168 decompositions. Duration: 0.0251322
-    The number 390390 has 7094 decompositions. Duration: 0.487375
-    The number 990990 has 15594 decompositions. Duration: 6.16972
-    The number 1981980 has 27988 decompositions. Duration: 31.5699
-    */
-
-
-    return 0;
-}
--- a/cpp_workspace/ex5/ex5_4/Makefile
+++ b/cpp_workspace/ex5/ex5_4/Makefile
@ -1,30 +0,0 @@
-#
-# use GNU-Compiler tools
-COMPILER=GCC_
-# alternatively from the shell
-# export COMPILER=GCC_
-# or, alternatively from the shell
-# make COMPILER=GCC_
-
-# use Intel compilers
-#COMPILER=ICC_
-
-# use PGI compilers
-# COMPILER=PGI_
-
-
-SOURCES = main.cpp benchmarks.cpp benchmark_tests.cpp
-OBJECTS = $(SOURCES:.cpp=.o)
-
-PROGRAM	= main.${COMPILER}
-
-# uncomment the next to lines for debugging and detailed performance analysis
-CXXFLAGS += -g
-LINKFLAGS += -g
-# do not use -pg with PGI compilers
-
-ifndef COMPILER
-  COMPILER=GCC_
-endif
-
-include ../${COMPILER}default.mk
--- a/cpp_workspace/ex5/ex5_4/benchmark_tests.h
+++ b/cpp_workspace/ex5/ex5_4/benchmark_tests.h
@ -1,13 +0,0 @@
-#pragma once
-#include <vector>
-using namespace std;
-
-vector<double> test_A(const size_t &NLOOPS, const size_t &N);
-
-vector<double> test_A_sum(const size_t &NLOOPS, const size_t &N);
-
-vector<double> test_B(const size_t &NLOOPS, const size_t &N, const size_t &M);
-
-vector<double> test_C(const size_t &NLOOPS, const size_t &L, const size_t &M, const size_t &N);
-
-vector<double> test_D(const size_t &NLOOPS, const size_t &N, const size_t &p);
--- a/cpp_workspace/ex5/ex5_4/benchmarks.cpp
+++ b/cpp_workspace/ex5/ex5_4/benchmarks.cpp
@ -1,141 +0,0 @@
-#include "benchmarks.h"
-#include <cassert>       // assert()
-#include <cmath>
-#include <iostream>
-#include <vector>
-#include <omp.h>
-
-// (A) Inner product of two vectors (from skalar_stl)
-double scalar_parallel(vector<double> const &x, vector<double> const &y)
-{
-    assert(x.size() == y.size());
-    size_t const N = x.size();
-    double sum = 0.0;
-//#pragma omp parallel for default(none) shared(x, y, N) reduction(+:sum) schedule(runtime)
-#pragma omp parallel for shared(x, y, N) reduction(+:sum)
-    for (size_t i = 0; i < N; ++i)
-    {
-        sum += x[i] * y[i];
-    }
-    return sum;
-}
-
-// (A) Vector entry sum
-double sum(vector<double> const &x)
-{
-    double sum = 0.0;
-#pragma omp parallel for shared(x) reduction(+:sum)
-    for (size_t i = 0; i < x.size(); ++i)
-    {
-        sum += x[i];
-    }
-    return sum;
-}
-
-
-// (B) Matrix-vector product (from intro_vector_densematrix)
-vector<double> MatVec_parallel(vector<double> const &A, vector<double> const &x)
-{
-    size_t const nelem = A.size();
-    size_t const N = x.size();
-    assert(nelem % N == 0); // make sure multiplication is possible
-    size_t const M = nelem/N;          
-
-    vector<double> b(M);    
-
-#pragma omp parallel for shared(A, x, N, M, b)
-    for (size_t i = 0; i < M; ++i)
-    {
-        double tmp = 0.0;
-        for (size_t j = 0; j < N; ++j)
-            tmp += A[N*i + j] * x[j];
-        b[i] = tmp;
-    }
-
-    return b;
-}
-
-
-// (C) Matrix-matrix product
-vector<double> MatMat_parallel(vector<double> const &A, vector<double> const &B, size_t const &L)
-{
-    size_t const nelem_A = A.size();
-    size_t const nelem_B = B.size();
-
-    assert(nelem_A % L == 0 && nelem_B % L == 0);
-
-    size_t const M = nelem_A/L;
-    size_t const N = nelem_B/L;
-
-
-    vector<double> C(M*N);
-
-            
-#pragma omp parallel for shared(A, B, M, N, L, C)
-    for (size_t i = 0; i < M; ++i)
-    {
-        for (size_t k = 0; k < L; ++k)
-        {
-            for (size_t j = 0; j < N; ++j)
-            {
-                C[N*i + j] += A[L*i + k]*B[N*k + j];
-            }
-            
-        }
-    }
-
-    return C;
-}
-
-
-// (D) Evaluation of a polynomial function
-vector<double> poly_parallel(vector<double> const &a, vector<double> const &x)
-{
-    size_t const N = x.size();
-    size_t const p = a.size() - 1;
-    vector<double> y(N, 0);
-
-#pragma omp parallel for shared(a, x, N, p, y)
-    for (size_t i = 0; i < N; ++i)
-    {
-        double x_temp = x[i];
-        double y_temp = 0;
-        for (size_t k = 0; k < p + 1; ++k)
-        {
-            y_temp += x_temp*y_temp + a[p - k];
-        }
-        y[i] = y_temp;
-    }
-
-    return y;
-}
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
--- a/cpp_workspace/ex5/ex5_4/benchmarks.h
+++ b/cpp_workspace/ex5/ex5_4/benchmarks.h
@ -1,55 +0,0 @@
-#pragma once
-#include <vector>
-using namespace std;
-
-/**     (A) Inner product of two vectors (from skalar_stl)
-	@param[in] x	vector
-	@param[in] y	vector
-	@return 	    resulting Euclidian inner product <x,y>
-*/
-double scalar_parallel(vector<double> const &x, vector<double> const &y);
-
-
-/**     (A) Sum entries of vector
-	@param[in] x	vector
-	@return 	    sum
-*/
-double sum(vector<double> const &x);
-
-
-/** 	(B) Matrix-vector product (from intro_vector_densematrix)
- * 	@param[in] A	dense matrix (1D access)
- *  @param[in] u	vector
- *
- *	@return    resulting vector
-*/
-vector<double> MatVec_parallel(vector<double> const &A, vector<double> const &x);
-
-
-/** 	(C) Matrix-matrix product
- * 	@param[in] A			MxL dense matrix (1D access)
- *  @param[in] B			LxN dense matrix (1D access)
- *  @param[in] shared_dim 	shared dimension L
- *
- *	@return    resulting MxN matrix
-*/
-vector<double> MatMat_parallel(vector<double> const &A, vector<double> const &B, size_t const &shared_dim);
-
-
-/** 	(D) Evaluation of a polynomial function using Horner's scheme
- * 	@param[in] a	coefficient vector
- *  @param[in] x	vector with input values
- *
- *	@return    vector with output values
-*/
-vector<double> poly_parallel(vector<double> const &a, vector<double> const &x);
-
-
-
-
-
-
-
-
-
-
--- a/cpp_workspace/ex5/ex5_4/main.cpp
+++ b/cpp_workspace/ex5/ex5_4/main.cpp
@ -1,84 +0,0 @@
-#include "benchmark_tests.h"
-#include <iostream>
-#include <cmath>
-
-int main()
-{   
-    vector<vector<double>> results_scalar;
-    results_scalar.push_back(test_A(2000000, pow(10,3)));
-    results_scalar.push_back(test_A(1000000, pow(10,4)));
-    results_scalar.push_back(test_A(100000, pow(10,5)));
-    results_scalar.push_back(test_A(10000, pow(10,6)));
-    results_scalar.push_back(test_A(750, pow(10,7)));
-    results_scalar.push_back(test_A(125, pow(10,8)));
-
-
-    vector<vector<double>> results_sum;
-    results_sum.push_back(test_A_sum(3000000, pow(10,3)));
-    results_sum.push_back(test_A_sum(2000000, pow(10,4)));
-    results_sum.push_back(test_A_sum(1000000, pow(10,5)));
-    results_sum.push_back(test_A_sum(50000, pow(10,6)));
-    results_sum.push_back(test_A_sum(2000, pow(10,7)));
-    results_sum.push_back(test_A_sum(250, pow(10,8)));
-
-
-    test_B(100, 20000, 10000);
-
-    test_C(25, 500, 1000, 1500);
-
-    test_D(100, 100, 1000000);
-    
-
-
-    cout << endl << "###### Scalar ######" << endl;
-    cout << "Timing\tGFLOPS\tGiByte/s" << endl;
-    cout << "------------------------------" << endl;
-    for (size_t i = 0; i < results_scalar.size(); ++i)
-        cout << results_scalar[i][0] << "\t" << results_scalar[i][1] << "\t" << results_scalar[i][2] << endl;
-
-    cout << endl << "###### Sum ######" << endl;
-    cout << "Timing\tGFLOPS\tGiByte/s" << endl;
-    cout << "------------------------------" << endl;
-    for (size_t i = 0; i < results_sum.size(); ++i)
-        cout << results_sum[i][0] << "\t" << results_sum[i][1] << "\t" << results_sum[i][2] << endl;
-
-
-
-    
-    // ###### Scalar ######
-    // Timing  GFLOPS  GiByte/s
-    // ------------------------------
-    // 3.4e-06 0.54    4.3
-    // 4.6e-06 4       32
-    // 1.6e-05 12      95
-    // 0.0011  1.7     13
-    // 0.0097  1.9     15
-    // 0.075   2.5     20
-
-
-    // ###### Sum ######
-    // Timing  GFLOPS  GiByte/s
-    // ------------------------------
-    // 5.5e-06 0.17    1.3
-    // 5.4e-06 1.7     14
-    // 1.5e-05 6.1     49
-    // 0.00013 7.2     57
-    // 0.0033  2.8     23
-    // 0.032   2.9     23
-
-    
-
-    
-    
-    // ######### NOT PARALLEL (from exercise sheet 2) #########
-    //      Timing  GFLOPS  GiByte/s
-    // ----------------------------------
-    // (A)  0.038   2.5     20
-    // (B)  0.13    2.9     23
-    // (C)  0.44    3.2     25
-    // (D)  0.19    1.5     12
-
-
-
-    return 0;
-}
--- a/ex1A_mean_values/main.cpp
+++ b/ex1A_mean_values/main.cpp
@ -0,0 +1,35 @@
+#include "means.h"
+#include <vector>
+#include <iostream>
+using namespace std;
+
+int main(int argc, char **argv)
+{
+    double arithmetic_mean, geometric_mean, harmonic_mean;
+
+    // Fixed version
+    calculate_means(1, 4, 16, arithmetic_mean, geometric_mean, harmonic_mean);
+    cout << arithmetic_mean << ", " << geometric_mean << ", " << harmonic_mean << endl;
+
+    calculate_means(2, 3, 5, arithmetic_mean, geometric_mean, harmonic_mean);
+    cout << arithmetic_mean << ", " << geometric_mean << ", " << harmonic_mean << endl;
+
+    calculate_means(1000, 4000, 16000, arithmetic_mean, geometric_mean, harmonic_mean);
+    cout << arithmetic_mean << ", " << geometric_mean << ", " << harmonic_mean << endl;
+    cout << "--------------------------------" << endl;
+
+
+    
+    // Scalable version
+    calculate_means(vector<int> {1, 4, 16}, arithmetic_mean, geometric_mean, harmonic_mean);
+    cout << arithmetic_mean << ", " << geometric_mean << ", " << harmonic_mean << endl;
+    
+    calculate_means(vector<int> {2, 3, 5}, arithmetic_mean, geometric_mean, harmonic_mean);
+    cout << arithmetic_mean << ", " << geometric_mean << ", " << harmonic_mean << endl;
+
+    calculate_means(vector<int> {1000, 4000, 16000}, arithmetic_mean, geometric_mean, harmonic_mean);
+    cout << arithmetic_mean << ", " << geometric_mean << ", " << harmonic_mean << endl;
+    
+    
+    return 0;
+}
--- a/ex1A_mean_values/means.cpp
+++ b/ex1A_mean_values/means.cpp
@ -0,0 +1,30 @@
+#include "../ex1A_mean_values/means.h"
+#include <cmath>
+#include <vector>
+
+void calculate_means(int a, int b, int c, double &am, double &gm, double &hm)
+{
+    am = (a + b + c)/3.0;
+    gm = exp((log(a)+log(b)+log(c))/3);
+    hm = 3.0/(1.0/a + 1.0/b + 1.0/c);
+}
+
+void calculate_means(std::vector<int> numbers, double &am, double &gm, double &hm)
+{
+    int n = numbers.size();
+
+    am = 0.;
+    gm = 0.;
+    hm = 0.;
+
+    for (int i = 0; i < n; ++i)
+    {
+        am += numbers[i];
+        gm += log(numbers[i]);
+        hm += 1.0/numbers[i];
+    }
+
+    am /= n;
+    gm = exp(gm/n);
+    hm = n/hm;
+}
--- a/ex1A_mean_values/means.h
+++ b/ex1A_mean_values/means.h
@ -0,0 +1,23 @@
+#pragma once
+#include <vector>
+
+/**
+  This function calculates arithmetic mean, geometric mean and harmonic mean of three integers.
+  @param[in]    a            first integer
+  @param[in]    b            second integer
+  @param[in]    c            third integer
+  @param[out]   am           arithmetic mean
+  @param[out]   gm           geometric mean
+  @param[out]   hm           harmonic mean
+*/
+void calculate_means(int a, int b, int c, double &am, double &gm, double &hm);
+
+/**
+  This function calculates arithmetic mean, geometric mean and harmonic mean of an integer vector.
+  @param[in]    numbers      vector containing integers
+  @param[out]   am           arithmetic mean
+  @param[out]   gm           geometric mean
+  @param[out]   hm           harmonic mean
+*/
+void calculate_means(std::vector<int> numbers, double &am, double &gm, double &hm);
+
--- a/ex1B_data-IO_and_vectors/Makefile
+++ b/ex1B_data-IO_and_vectors/Makefile
@ -13,7 +13,7 @@ COMPILER=GCC_
 # COMPILER=PGI_


-SOURCES = main.cpp mylib.cpp
+SOURCES = main.cpp ../ex1A_mean_values/means.cpp
 OBJECTS = $(SOURCES:.cpp=.o)

 PROGRAM	= main.${COMPILER}
--- a/ex1B_data-IO_and_vectors/data_1.txt
+++ b/ex1B_data-IO_and_vectors/data_1.txt
--- a/ex1B_data-IO_and_vectors/main.cpp
+++ b/ex1B_data-IO_and_vectors/main.cpp
@ -0,0 +1,53 @@
+#include "../ex1A_mean_values/means.h"
+#include <iostream>
+#include <fstream>
+#include <cmath>
+#include <vector>
+#include <algorithm>
+using namespace std;
+
+
+int main(int argc, char **argv)
+{
+    // read vector from file
+    vector<int> data_vector = {};
+
+    ifstream input_stream("data_1.txt"); 
+
+    int line;
+    while(input_stream >> line)
+    {
+        data_vector.push_back(line);
+    }
+    data_vector.shrink_to_fit();
+
+
+    // calculate minimum and maximum
+    vector<int>::iterator min_it = min_element(data_vector.begin(), data_vector.end());
+    vector<int>::iterator max_it = max_element(data_vector.begin(), data_vector.end());
+
+    // calculate arithmetic mean, geometric mean and harmonic mean
+    double am, gm, hm;
+    calculate_means(data_vector, am, gm, hm);
+
+
+    // calculate standard deviation
+    double sd = 0.;
+    int n = data_vector.size();
+    for (int i = 0; i < n; ++i)
+    {
+        sd += pow(data_vector[i] - am, 2);
+    }
+    sd = sqrt(sd/n);
+
+
+    // print results
+    cout << "minimum: " << *min_it << endl;
+    cout << "maximum: " << *max_it << endl;
+    cout << "arithmetic mean: " << am << endl;
+    cout << "geometric mean: " << gm << endl;
+    cout << "harmonic mean: " << hm << endl;
+    cout << "standard deviation: " << sd << endl;
+
+    return 0;
+}
--- a/ex1C_summation_of_specified_numbers/main.cpp
+++ b/ex1C_summation_of_specified_numbers/main.cpp
@ -0,0 +1,31 @@
+#include "special_sum.h"
+#include "../utils/timing.h"
+#include <iostream>
+#include <chrono>
+#include <stack>
+using namespace std;
+
+int main(int argc, char **argv)
+{
+    // check results and compare speeds
+    for(size_t n : {15, 1001, 1432987})
+    {
+        cout << "n = " << n << endl;
+        size_t sum_1, sum_2;
+
+        tic();
+        for(size_t i = 0; i < 1000; ++i)
+            sum_1 = special_sum_loop(n);
+        double time_1 = toc();
+
+        tic();
+        for(size_t i = 0; i < 1000; ++i)
+            sum_2 = special_sum_noloop(n);
+        double time_2 = toc();
+            
+        cout << "loop: " << sum_1 << "\t\tDuration: " << time_1 << endl;
+        cout << "no loop: " << sum_2 << "\t\tDuration: " << time_2 << endl << "---------------------------------------------------" << endl;
+    }
+
+    return 0;
+}
--- a/ex1C_summation_of_specified_numbers/special_sum.cpp
+++ b/ex1C_summation_of_specified_numbers/special_sum.cpp
@ -0,0 +1,28 @@
+#include "special_sum.h"
+
+size_t gauss_sum(size_t n)
+{
+    return (n*(n+1))/2;
+}
+
+size_t special_sum_loop(size_t n)
+{
+    size_t sum = 0;
+    for (size_t i = 1; i < n+1; ++i)
+    {
+        if (i % 3 == 0 || i % 5 == 0)
+        {
+            sum += i;
+        }
+    }
+    return sum;
+}
+
+size_t special_sum_noloop(size_t n)
+{
+    size_t factor_3 = gauss_sum(n/3);   // dividing int by int automatically gets rounded off
+    size_t factor_5 = gauss_sum(n/5);
+    size_t factor_15 = gauss_sum(n/15);
+
+    return factor_3*3 + factor_5*5 - factor_15*15;
+}
--- a/ex1C_summation_of_specified_numbers/special_sum.h
+++ b/ex1C_summation_of_specified_numbers/special_sum.h
@ -0,0 +1,18 @@
+#include <cstddef>
+
+/**
+  This function returns the sum of all positive integers less or equal n which are a multiples of 3 or of 5, WITH using a loop.
+  @param[in]    n         
+  @param[out]   M        
+*/
+size_t special_sum_loop(size_t n);
+
+
+/**
+  This function returns the sum of all positive integers less or equal n which are a multiples of 3 or of 5, WITHOUT using a loop.
+  Example: For n=15, we have 60 = 3+5+6+9+10+12+15 = (1+2+3+4+5)*3 + (1+2+3)*5 - 1*15
+  Formula: M = (\sum_{i=1}^{k_3} i)*3 + (\sum_{i=1}^{k_5} i)*5 - (\sum_{i=1}^{k_15} i)*15
+  @param[in]    n         
+  @param[out]   M        
+*/
+size_t special_sum_noloop(size_t n);
--- a/ex1D_kahan_summation/main.cpp
+++ b/ex1D_kahan_summation/main.cpp
@ -0,0 +1,33 @@
+#include "mylib.h"
+#include <cmath>
+#include <iostream>
+using namespace std;
+
+int main(int argc, char **argv)
+{
+    for(size_t i = 1; i < 8; ++i)
+    {
+        size_t n = pow(10,i);
+        vector<double> x(n);
+        for (size_t k = 0; k < n; ++k)
+            x[k] = 1.0/(k + 1);
+            
+
+        // compute scalar products
+        double sum_1 = scalar(x, x);
+        double sum_2 = Kahan_skalar(x, x);
+
+        // compute error
+        double err_1 = abs(sum_1 - pow(M_PI,2)/6);
+        double err_2 = abs(sum_2 - pow(M_PI,2)/6);
+
+        cout << "n = " << n << endl;
+        cout << "Normal scalar product: " << sum_1 << "\terror: " << err_1 << endl;
+        cout << "Kahan scalar product: " << sum_2 << "\terror: " << err_2 << endl;
+        cout << endl;
+    }
+    
+    
+
+    return 0;
+}
--- a/ex1D_kahan_summation/mylib.cpp
+++ b/ex1D_kahan_summation/mylib.cpp
@ -0,0 +1,34 @@
+#include "mylib.h"
+#include <cassert>       // assert()
+#include <cmath>
+#include <vector>
+using namespace std;
+
+double scalar(vector<double> const &x, vector<double> const &y)
+{
+    assert(x.size() == y.size());
+    size_t const N = x.size();
+    double sum = 0.0;
+    for (size_t i = 0; i < N; ++i)
+    {
+        sum += x[i] * y[i];
+    }
+    return sum;
+}
+
+
+
+double Kahan_skalar(vector<double> const &x, vector<double> const &y)
+{
+    double sum = 0;
+    double c = 0;
+    size_t n = x.size();
+    for (size_t i = 0; i < n; ++i)
+    {
+        double z = x[i]*y[i] - c;    // c is the part that got lost in the last iteration
+        double t = sum + z;          // when adding sum + z, the lower digits are lost if sum is large
+        c = (t - sum) - z;           // now we recover the lower digits to add in the next iteration
+        sum = t;
+    }
+    return sum;
+}
--- a/ex1D_kahan_summation/mylib.h
+++ b/ex1D_kahan_summation/mylib.h
@ -0,0 +1,30 @@
+#ifndef FILE_MYLIB
+#define FILE_MYLIB
+#include <vector>
+
+/** 	Inner product
+	@param[in] x	vector
+	@param[in] y	vector
+	@return 	    resulting Euclidian inner product <x,y>
+*/
+double scalar(std::vector<double> const &x, std::vector<double> const &y);
+
+/** 	Inner product using BLAS routines
+	@param[in] x	vector
+	@param[in] y	vector
+	@return 	    resulting Euclidian inner product <x,y>
+*/
+double scalar_cblas(std::vector<double> const &x, std::vector<double> const &y);
+float scalar_cblas(std::vector<float> const &x, std::vector<float> const &y);
+
+
+/** 	L_2 Norm of a vector
+	@param[in] x	vector
+	@return 	    resulting Euclidian norm <x,y>
+*/
+double norm(std::vector<double> const &x);
+
+double Kahan_skalar(std::vector<double> const &x,  std::vector<double> const &y);
+
+
+#endif
--- a/ex1E_vector_vs_list/main.cpp
+++ b/ex1E_vector_vs_list/main.cpp
@ -0,0 +1,59 @@
+#include "../utils/timing.h"
+#include <iostream>
+#include <random>
+#include <chrono>
+#include <vector>
+#include <list>
+#include <algorithm>
+using namespace std;
+
+
+size_t random_integer(int lower_bound, int upper_bound)
+{
+    unsigned seed = chrono::system_clock::now().time_since_epoch().count();
+
+    minstd_rand0 generator (seed);
+
+    return lower_bound + generator() % (upper_bound - lower_bound + 1);
+}
+
+int main(int argc, char **argv)
+{
+    // start with generating a sorted vector/list
+    size_t n = 10000;
+    vector<int> x_vec = {};
+    list<int> x_list = {};
+    for(size_t k = 0; k < n; ++k)
+    {
+        x_vec.push_back(k + 1);
+        x_list.push_back(k + 1);
+    }
+
+
+    // insert new random entries such that the container stays sorted
+    tic();
+    for(size_t i = 0; i < n; ++i)
+    {
+        size_t new_entry = random_integer(1,n);
+        auto it = lower_bound(x_vec.begin(), x_vec.end(), new_entry);
+        x_vec.insert(it, new_entry);
+    }
+    double time_1 = toc();
+
+    tic();
+    for(size_t i = 0; i < n; ++i)
+    {
+        size_t new_entry = random_integer(1,n);
+        auto it = lower_bound(x_list.begin(), x_list.end(), new_entry);
+        x_list.insert(it, new_entry);
+    }
+    double time_2 = toc();
+
+
+    // check results
+    cout << "New vector is sorted: " << std::boolalpha << is_sorted(x_vec.begin(), x_vec.end()) << "\tsize: " << x_vec.size() << "\tduration: " << time_1 << endl;
+    cout << "New list is sorted: " << std::boolalpha << is_sorted(x_list.begin(), x_list.end()) << "\tsize: " << x_list.size() << "\tduration: " << time_2 << endl;
+
+
+    return 0;
+}
--- a/ex1F_goldbachs_conjecture/goldbach.cpp
+++ b/ex1F_goldbachs_conjecture/goldbach.cpp
@ -1,7 +1,4 @@
 #include "goldbach.h"
-#include <iostream>
-#include <iterator>
-#include <omp.h>

 size_t single_goldbach(size_t k)
 {
@ -10,13 +7,12 @@ size_t single_goldbach(size_t k)

    size_t counter = 0;

-#pragma omp parallel for shared(relevant_primes, m, k) reduction(+:counter)
    for(size_t i = 0; i < m; ++i)
    {
        for(size_t j = i; j < m; ++j)
        {
            if(relevant_primes[i] + relevant_primes[j] == k)
-                ++counter;
+                counter += 1;
        }
    }

@ -31,7 +27,7 @@ std::vector<size_t> count_goldbach(size_t n)

    std::vector<size_t> counter_vector(n + 1, 0);

-#pragma omp parallel for shared(relevant_primes, m, n) reduction(VecAdd:counter_vector)
+
    for(size_t i = 0; i < m; ++i)
    {
        for(size_t j = i; j < m; ++j)
--- a/ex1F_goldbachs_conjecture/goldbach.h
+++ b/ex1F_goldbachs_conjecture/goldbach.h
@ -0,0 +1,21 @@
+#pragma once
+#include "mayer_primes.h"
+#include <iostream>
+#include <vector>
+#include <iterator>
+#include <cassert>
+
+
+/**
+  This function returns the number of possible decompositions of an integer into a sum of two prime numbers.
+  @param[in]    k            first integer
+  @param[out]   count        number of decompositions
+*/
+size_t single_goldbach(size_t k);
+
+/**
+  This function returns the number of possible decompositions into a sum of two prime numbers of all even integers in the interval [4,n].
+  @param[in]    n                   upper integer bound
+  @param[out]   count_vector        vector containing the number of decompositions for a natural number the corresponding index
+*/
+std::vector<size_t> count_goldbach(size_t n);
--- a/ex1F_goldbachs_conjecture/main.cpp
+++ b/ex1F_goldbachs_conjecture/main.cpp
@ -0,0 +1,37 @@
+#include "../utils/timing.h"
+#include "goldbach.h"
+#include <iostream>
+#include <algorithm>
+using namespace std;
+
+
+int main(int argc, char **argv)
+{
+    cout << "Check: 694 has "<< single_goldbach(694) << " decompositions." << endl << "----------------------------------------" << endl;
+
+
+    for(size_t n : {10000, 100000, 400000, 1000000, 2000000})
+    {
+        tic();
+
+        auto goldbach_vector = count_goldbach(n);
+
+        auto max_it = max_element(goldbach_vector.begin(), goldbach_vector.end());
+        size_t max_number = distance(goldbach_vector.begin(), max_it);
+
+        double time = toc();
+
+        cout << "The number " <<  max_number << " has " << *max_it << " decompositions. Duration: " << time << endl;
+    }
+
+    /*
+    The number 9240 has 329 decompositions. Duration: 0.00572876
+    The number 99330 has 2168 decompositions. Duration: 0.3342
+    The number 390390 has 7094 decompositions. Duration: 4.23734
+    The number 990990 has 15594 decompositions. Duration: 29.5817
+    The number 1981980 has 27988 decompositions. Duration: 135.985
+    */
+
+
+    return 0;
+}
--- a/ex1F_goldbachs_conjecture/mayer_primes.h
+++ b/ex1F_goldbachs_conjecture/mayer_primes.h
--- a/ex1G_dense_matrices_access/DenseMatrix.h
+++ b/ex1G_dense_matrices_access/DenseMatrix.h
@ -0,0 +1,71 @@
+#pragma once
+#include "sigmoid.h"
+#include <iostream>
+#include <vector>
+using namespace std;
+
+class DenseMatrix
+{
+    private:
+        vector<double> M;
+        size_t n;
+        size_t m;
+
+
+    public:
+        vector<double> Mult(const vector<double> &x) const
+        {
+            vector<double> y(n,0);
+            for(size_t i = 0; i < n; ++i)   // iterate row
+            {
+                for(size_t j = 0; j < m; ++j)   // iterate column
+                {
+                    y[i] += M[i*m + j]*x[j];
+                }
+            }
+            return y;
+        }
+
+        vector<double> MultT(const vector<double> &y) const
+        {
+            vector<double> x(m,0);
+            for(size_t j = 0; j < m; ++j)   // iterate column
+            {
+                for(size_t i = 0; i < n; ++i)   // iterate row
+                {
+                    x[j] += M[i*m + j]*y[i];
+                }
+            }
+            return x;
+        }
+
+        void Print() const
+        {
+            for(size_t i = 0; i < n; ++i)   // iterate row
+            {
+                for(size_t j = 0; j < m; ++j)   // iterate column
+                {
+                    cout << M[i*m + j] << "  ";
+                }
+                cout << endl;
+            }
+            cout << endl;
+        }
+
+
+        DenseMatrix(size_t n, size_t m)
+        {
+            this->n = n;
+            this->m = m;
+            M = vector<double>(n*m);
+            size_t nm = max(n,m);
+
+            for(size_t i = 0; i < n; ++i)   // iterate row
+            {
+                for(size_t j = 0; j < m; ++j)   // iterate column
+                {
+                    M[i*m + j] = sigmoid(x_entry(i,nm))*sigmoid(x_entry(j,nm));
+                }
+            }
+        }
+};
--- a/ex1G_dense_matrices_access/ProductMatrix.h
+++ b/ex1G_dense_matrices_access/ProductMatrix.h
@ -0,0 +1,52 @@
+#pragma once
+#include <cmath>
+#include <iostream>
+#include <vector>
+using namespace std;
+
+class ProductMatrix
+{
+    private:
+        vector<double> u;
+        vector<double> v;
+        size_t n;
+        size_t m;
+
+    public:
+        vector<double> Mult(const vector<double> &x) const
+        {
+            vector<double> y(n,0);
+            for(int i = 0; i < n; ++i)
+            {
+                for(int j = 0; j < m; ++j)
+                {
+                    y[i] += v[j]*x[j];
+                }
+                y[i] *= u[i];
+            }
+            return y;
+        }
+
+        vector<double> MultT(const vector<double> &y) const
+        {
+            vector<double> x(m,0);
+            for(int j = 0; j < m; ++j)
+            {
+                for(int i = 0; i < n; ++i)
+                {
+                    x[j] += y[i]*u[i];
+                }
+                x[j] *= v[j];
+            }
+            return x;
+        }
+
+        ProductMatrix(const vector<double> &u, const vector<double> &v)
+        {
+            n = u.size();
+            m = v.size();
+            this->u = u;
+            this->v = v;
+            
+        }
+};
--- a/ex1G_dense_matrices_access/main.cpp
+++ b/ex1G_dense_matrices_access/main.cpp
@ -0,0 +1,93 @@
+#include "../utils/timing.h"
+#include "DenseMatrix.h"
+#include "ProductMatrix.h"
+#include <algorithm>
+
+int main()
+{
+    // b) ------------------------------------------------------------------------------------------------------
+    DenseMatrix const M(5,3);
+    vector<double> const u{{1, 2, 3}};
+    vector<double> f1 = M.Mult(u);
+    vector<double> const v{{-1, 2, -3, 4, -5}};
+    vector<double> f2 = M.MultT(v);
+
+    M.Print();
+
+    for(size_t i = 0; i < f1.size(); ++i)
+        cout << f1[i] << endl;
+    cout << endl;
+
+
+    for(size_t j = 0; j < f2.size(); ++j)
+        cout << f2[j] << "  ";
+    cout << endl << "-------------------------------------------------" << endl;
+
+    // c) ------------------------------------------------------------------------------------------------------
+    size_t n = pow(10,3);
+    DenseMatrix const M_1(n,n);
+    vector<double> x(n, 1.0);
+
+    size_t n_loops = 100;
+    vector<double> y_1;
+    vector<double> y_2;
+
+    double time_1 = 0;
+    double time_2 = 0;
+
+    tic();
+    for(int l = 0; l < n_loops; ++l)
+        y_1 = M_1.Mult(x);
+    time_1 += toc();
+
+    tic();
+    for(int l = 0; l < n_loops; ++l)
+        y_2 = M_1.MultT(x);
+    time_2 += toc();
+
+    vector<double> error_vec(n,0);
+    for(int i = 0; i < n; ++i)
+        error_vec[i] = abs(y_1[i] - y_2[i]);
+    double sup_error = *max_element(error_vec.begin(), error_vec.end());
+
+
+    cout << "n = " << n << endl;
+    cout << "Average duration for Mult: " << time_1/n_loops << endl;
+    cout << "Average duration for MultT: " << time_2/n_loops << endl;
+    cout << "sup-error: " << sup_error << endl;
+    cout << "-------------------------------------------------" << endl;
+
+    // d) ------------------------------------------------------------------------------------------------------
+    vector<double> u_M(n,0);
+    for(int i = 0; i < n; ++i)
+        u_M[i] = sigmoid(x_entry(i, n));
+
+    ProductMatrix const M_2(u_M, u_M);
+
+    time_1 = 0;
+    time_2 = 0;
+
+    tic();
+    for(int l = 0; l < n_loops; ++l)
+        y_1 = M_2.Mult(x);
+    time_1 += toc();
+
+    tic();
+    for(int l = 0; l < n_loops; ++l)
+        y_2 = M_2.MultT(x);
+    time_2 += toc();
+
+    for(int i = 0; i < n; ++i)
+        error_vec[i] = abs(y_1[i] - y_2[i]);
+    sup_error = *max_element(error_vec.begin(), error_vec.end());
+
+
+    cout << "n = " << n << endl;
+    cout << "Average duration for Mult: " << time_1/n_loops << endl;
+    cout << "Average duration for MultT: " << time_2/n_loops << endl;
+    cout << "sup-error: " << sup_error << endl;
+    cout << "-------------------------------------------------" << endl;
+
+
+    return 0;
+}
--- a/ex1G_dense_matrices_access/sigmoid.h
+++ b/ex1G_dense_matrices_access/sigmoid.h
@ -0,0 +1,12 @@
+#pragma once
+#include <cmath>
+
+double sigmoid(double x)
+{
+    return 1./(1. + exp(-x));
+}
+
+double x_entry(size_t k, size_t nm)
+{
+    return (10.*k)/(nm - 1) - 5.;
+}
--- a/ex2A_stationary_heat_equation_fixed_lambda/ex2A.pdf
+++ b/ex2A_stationary_heat_equation_fixed_lambda/ex2A.pdf
--- a/ex2B_stationary_heat_equation_variable_lambda/ex2B.pdf
+++ b/ex2B_stationary_heat_equation_variable_lambda/ex2B.pdf
--- a/ex2C_peclet_problem/ex2C.pdf
+++ b/ex2C_peclet_problem/ex2C.pdf
--- a/ex2C_peclet_problem/main.py
+++ b/ex2C_peclet_problem/main.py
@ -0,0 +1,18 @@
+import numpy as np
+import matplotlib.pyplot as plt
+
+def u(x,p):
+    return (np.exp(p*x) - 1)/(np.exp(p) - 1)
+
+x = np.linspace(0, 1, 200)
+
+
+plt.figure(figsize=(8, 5))
+for p in [ -3, -1, -0.5, 0.5,  2,  4]:
+    plt.plot(x, u(x, p), label="p ="+ str(p))
+
+plt.xlabel('x')
+plt.ylabel('u(x)')
+plt.legend()
+plt.grid(True)
+plt.show()
--- a/ex3_benchmarks/Doxyfile
+++ b/ex3_benchmarks/Doxyfile
--- a/cpp_workspace/ex5/ex5_2/Makefile
+++ b/cpp_workspace/ex5/ex5_2/Makefile
@ -13,7 +13,9 @@ COMPILER=GCC_
 # COMPILER=PGI_


-SOURCES = main.cpp mylib.cpp
+#SOURCES = main.cpp benchmarks.cpp benchmark_tests.cpp factorization_solve.cpp factorization_solve_tests.cpp
+# GH
+SOURCES = main.cpp benchmarks.cpp benchmark_tests.cpp factorization_solve.cpp factorization_solve_tests.cpp vdop.cpp getmatrix.cpp
 OBJECTS = $(SOURCES:.cpp=.o)

 PROGRAM	= main.${COMPILER}
@ -27,5 +29,4 @@ ifndef COMPILER
  COMPILER=GCC_
 endif

-
 include ../${COMPILER}default.mk
--- a/cpp_workspace/ex5/ex5_4/benchmark_tests.cpp
+++ b/cpp_workspace/ex5/ex5_4/benchmark_tests.cpp
@ -5,7 +5,7 @@
 #include <math.h>       
 using namespace std::chrono; 

-vector<double> test_A(const size_t &NLOOPS, const size_t &N)
+vector<double> test_A(const size_t &NLOOPS, const size_t &N, const function<double(const vector<double>&, const vector<double>&)>& scalar_function)
 {
    cout << "#################### (A) ####################" << endl;
    cout << "\nLOOPS = " << NLOOPS << endl;
@ -39,7 +39,7 @@ vector<double> test_A(const size_t &NLOOPS, const size_t &N)
    double check(0.0),ss(0.0);
    for (size_t i = 0; i < NLOOPS; ++i)
    {
-        check = scalar_parallel(x, y);
+        check = scalar_function(x, y);
        ss += check;                   // prevents the optimizer from removing unused calculation results.
    }

@ -70,79 +70,37 @@ vector<double> test_A(const size_t &NLOOPS, const size_t &N)
    cout << "GFLOPS         : " << Gflops << endl;
    cout << "GiByte/s       : " << MemBandwidth << endl;

+//##########################################################################
+    cout << "\nStart Benchmarking norm\n";

-    return vector<double>{t_diff, Gflops, MemBandwidth};
-}
-
-vector<double> test_A_sum(const size_t &NLOOPS, const size_t &N)
-{
-    cout << "#################### (A) sum ####################" << endl;
-    cout << "\nLOOPS = " << NLOOPS << endl;
-    cout << "\nN = " << N << endl;
-
-
-//  Memory allocation
-    cout << "Memory allocation\n";
-
-    vector<double> x(N);
-
-    cout.precision(2);
-    cout << 1.0*N *sizeof(x[0]) / 1024 / 1024 / 1024 << " GByte Memory allocated\n";
-    cout.precision(6);
-
-
-//  Data initialization
-
-    for (size_t i = 0; i < N; ++i)
-    {
-        x[i] = 1;
-    }
-
-
-    cout << "\nStart Benchmarking sum\n";
-
-    auto t1 = system_clock::now(); // start timer
+    auto t3 = system_clock::now(); // start timer
 // Do calculation
-    double check(0.0),ss(0.0);
+    double ss2(0.0);
    for (size_t i = 0; i < NLOOPS; ++i)
    {
-        check = sum(x);
-        ss += check;                   // prevents the optimizer from removing unused calculation results.
+        auto sk1 = sqrt(scalar(x, x));
+        ss2 += sk1;                   // prevents the optimizer from removing unused calculation results.
    }

-    auto t2 = system_clock::now();  // stop timer
-    auto duration = duration_cast<microseconds>(t2 - t1);        // duration in microseconds
-    double t_diff = static_cast<double>(duration.count()) / 1e6; // overall duration in seconds
-    t_diff = t_diff/NLOOPS;                                      // duration per loop seconds
+    auto t4 = system_clock::now();  // stop timer
+    auto duration2 = duration_cast<microseconds>(t4 - t3);        // duration in microseconds
+    double t_diff2 = static_cast<double>(duration2.count()) / 1e6; // overall duration in seconds
+    t_diff2 = t_diff2/NLOOPS;                                      // duration per loop seconds
+    
+
+    cout << "ss(norm): " << ss2 << endl;
+    cout << "Timing in sec. : " << t_diff2 << endl;
+
+
+
    


-// Check the correct result
-    cout << "\n <x,y> = " << check << endl;
-    if (static_cast<unsigned int>(check) != N)
-        cout << "  !!   W R O N G  result   !!\n";
-    cout << endl;
-
-
-// Timings  and Performance
-    cout << endl;
-    cout.precision(2);
-
-
-    double Gflops = 1.0*N / t_diff / 1024 / 1024 / 1024;
-    double MemBandwidth = 1.0*N / t_diff / 1024 / 1024 / 1024 * sizeof(x[0]);
-
-    cout << "Total duration : " << t_diff*NLOOPS << endl;
-    cout << "Timing in sec. : " << t_diff << endl;
-    cout << "GFLOPS         : " << Gflops << endl;
-    cout << "GiByte/s       : " << MemBandwidth << endl;
-
-
    return vector<double>{t_diff, Gflops, MemBandwidth};
 }


-vector<double> test_B(const size_t &NLOOPS, const size_t &N, const size_t &M)
+vector<double> test_B(const size_t &NLOOPS, const size_t &N, const size_t &M, const function<vector<double>(const vector<double>&, const vector<double>&)>& MatVec_function)
 {
    cout << "#################### (B) ####################" << endl;

@ -180,7 +138,7 @@ vector<double> test_B(const size_t &NLOOPS, const size_t &N, const size_t &M)
    
    for (size_t i = 0; i < NLOOPS; ++i)
    {
-        b = MatVec_parallel(A, x);
+        b = MatVec_function(A, x);
    }

    auto t2 = system_clock::now();  // stop timer
@ -216,7 +174,7 @@ vector<double> test_B(const size_t &NLOOPS, const size_t &N, const size_t &M)
 }


-vector<double> test_C(const size_t &NLOOPS, const size_t &L, const size_t &M, const size_t &N)
+vector<double> test_C(const size_t &NLOOPS, const size_t &L, const size_t &M, const size_t &N, const function<vector<double>(const vector<double>&, const vector<double>&, size_t const &shared_dim)>& MatMat_function)
 {
    cout << "#################### (C) ####################" << endl;
    cout << "\nLOOPS = " << NLOOPS << endl;
@ -253,11 +211,11 @@ vector<double> test_C(const size_t &NLOOPS, const size_t &L, const size_t &M, co
 // Do calculation
    vector<double> C(M*N);
    double check;
-    double check_sum = 0;
+    double check_sum;                 // GH: initialize

    for (size_t i = 0; i < NLOOPS; ++i)
    {
-        C = MatMat_parallel(A, B, L);
+        C = MatMat_function(A, B, L);

        check = C[N*17];
        check_sum += check; // prevents the optimizer from removing unused calculation results.
@ -334,7 +292,7 @@ vector<double> test_D(const size_t &NLOOPS, const size_t &N, const size_t &p)
    
    for (size_t i = 0; i < NLOOPS; ++i)
    {
-        y = poly_parallel(a, x);
+        y = poly(a, x);
        check = y[0];

        check_sum += check; // prevents the optimizer from removing unused calculation results.
@ -372,4 +330,4 @@ vector<double> test_D(const size_t &NLOOPS, const size_t &N, const size_t &p)


    return vector<double>{t_diff, Gflops, MemBandwidth};
-}
+}
--- a/ex3_benchmarks/benchmark_tests.h
+++ b/ex3_benchmarks/benchmark_tests.h
@ -0,0 +1,15 @@
+#pragma once
+#include <vector>
+#include <functional>
+using namespace std;
+
+
+
+
+vector<double> test_A(const size_t &NLOOPS, const size_t &N, const function<double(const vector<double>&, const vector<double>&)>& scalar_function);
+
+vector<double> test_B(const size_t &NLOOPS, const size_t &N, const size_t &M, const function<vector<double>(const vector<double>&, const vector<double>&)>& MatVec_function);
+
+vector<double> test_C(const size_t &NLOOPS, const size_t &L, const size_t &M, const size_t &N, const function<vector<double>(const vector<double>&, const vector<double>&, size_t const &shared_dim)>& MatMat_function);
+
+vector<double> test_D(const size_t &NLOOPS, const size_t &N, const size_t &p);
--- a/ex3_benchmarks/benchmarks.cpp
+++ b/ex3_benchmarks/benchmarks.cpp
@ -0,0 +1,246 @@
+#include "benchmarks.h"
+#include "vdop.h"
+#include <iostream>
+#include <vector>
+#include <cmath>
+#include <cassert>       // assert()
+
+
+#ifdef __INTEL_CLANG_COMPILER
+#pragma message(" ##########  Use of MKL  ###############")
+#include <mkl.h>
+#else
+#pragma message(" ##########  Use of CBLAS  ###############")
+
+#include <cblas.h>               // cBLAS Library
+#include <lapacke.h>             // Lapack
+
+#endif
+
+
+
+// (A) Inner product of two vectors (from skalar_stl)
+double scalar(vector<double> const &x, vector<double> const &y)
+{
+    assert(x.size() == y.size());
+    size_t const N = x.size();
+    double sum = 0.0;
+    for (size_t i = 0; i < N; ++i)
+    {
+        sum += x[i] * y[i];
+    }
+    return sum;
+}
+
+// (A) 5.(b) Kahan scalar product
+double Kahan_skalar(vector<double> const &x, vector<double> const &y)
+{
+    double sum = 0.0;
+    double c = 0.0;
+    size_t n = x.size();
+    for (size_t i = 0; i < n; ++i)
+    {
+        double z = x[i]*y[i] - c;    // c is the part that got lost in the last iteration
+        double t = sum + z;          // when adding sum + z, the lower digits are lost if sum is large
+        c = (t - sum) - z;           // now we recover the lower digits to add in the next iteration
+        sum = t;
+    }
+    return sum;
+}
+
+// (A) 6. cBLAS scalar product
+double scalar_cBLAS(vector<double> const &x, vector<double> const &y)
+{
+    return cblas_ddot(x.size(), x.data(), 1, y.data(), 1); // x.data() = &x[0]
+}
+
+
+// (B) Matrix-vector product (from intro_vector_densematrix)
+vector<double> MatVec(vector<double> const &A, vector<double> const &x)
+{
+    size_t const nelem = A.size();
+    size_t const N = x.size();
+    assert(nelem % N == 0); // make sure multiplication is possible
+    size_t const M = nelem/N;          
+
+
+    vector<double> b(M);    
+
+    for (size_t i = 0; i < M; ++i)
+    {
+        double tmp = 0.0;
+        for (size_t j = 0; j < N; ++j)
+            tmp += A[N*i + j] * x[j];
+        b[i] = tmp;
+    }
+
+    return b;
+}
+
+// (B) cBLAS Matrix-vector product
+vector<double> MatVec_cBLAS(vector<double> const &A, vector<double> const &x)
+{
+    size_t const nelem = A.size();
+    size_t const N = x.size();
+    assert(nelem % N == 0); // make sure multiplication is possible
+    size_t const M = nelem/N;
+
+
+    vector<double> b(M);
+
+    cblas_dgemv(CblasRowMajor, CblasNoTrans, M, N, 1.0, A.data(), N, x.data(), 1, 0.0, b.data(), 1);
+
+    return b;
+}
+
+
+// (C) Matrix-matrix product
+vector<double> MatMat(vector<double> const &A, vector<double> const &B, size_t const &L)
+{
+    size_t const nelem_A = A.size();
+    size_t const nelem_B = B.size();
+
+    assert(nelem_A % L == 0 && nelem_B % L == 0);
+
+    size_t const M = nelem_A/L;
+    size_t const N = nelem_B/L;
+
+
+    vector<double> C(M*N);
+
+            
+
+    for (size_t i = 0; i < M; ++i)
+    {
+        for (size_t j = 0; j < N; ++j)
+        {
+            double C_temp = 0;
+            for (size_t k = 0; k < L; ++k)
+            {
+                C_temp += A[L*i + k]*B[N*k + j];
+            }
+            C[N*i + j] = C_temp;
+        }
+    }
+
+    return C;
+}
+
+// (C) cBLAS matrix-matrix product
+vector<double> MatMat_cBLAS(vector<double> const &A, vector<double> const &B, size_t const &L)
+{
+    size_t const nelem_A = A.size();
+    size_t const nelem_B = B.size();
+
+    assert(nelem_A % L == 0 && nelem_B % L == 0);
+
+    size_t const M = nelem_A/L;
+    size_t const N = nelem_B/L;
+
+
+    vector<double> C(M*N);
+
+    cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, M, N, L, 1.0, A.data(), L, B.data(), N, 0.0, C.data(), N);
+
+    return C;
+}
+
+
+// (D) Evaluation of a polynomial function
+vector<double> poly(vector<double> const &a, vector<double> const &x)
+{
+    size_t const N = x.size();
+    size_t const p = a.size() - 1;
+    vector<double> y(N, 0);
+
+    for (size_t i = 0; i < N; ++i)
+    {
+        double x_temp = x[i];
+        double y_temp = 0;
+        for (size_t k = 0; k < p + 1; ++k)
+        {
+            y_temp += x_temp*y_temp + a[p - k];
+        }
+        y[i] = y_temp;
+    }
+
+    return y;
+}
+
+
+// (E) Solves linear system of equations
+void JacobiSolve(CRS_Matrix const &SK, vector<double> const &f, vector<double> &u)
+{
+    const double omega   = 1.0;
+    const int    maxiter = 1000;
+    const double tol  = 1e-5,                // tolerance
+                 tol2 = tol * tol;           // tolerance^2
+
+    int nrows = SK.Nrows();                  // number of rows == number of columns
+    assert( nrows == static_cast<int>(f.size()) && f.size() == u.size() );
+
+    cout << endl << " Start Jacobi solver for " << nrows << " d.o.f.s"  << endl;
+    //  Choose initial guess
+    for (int k = 0; k < nrows; ++k)
+    {
+        u[k] = 0.0;                          //  u := 0
+    }
+
+    vector<double> dd(nrows);                // matrix diagonal
+    vector<double>  r(nrows);                // residual
+    vector<double>  w(nrows);                // correction
+
+    SK.GetDiag(dd);                          //  dd := diag(K)
+    ////DebugVector(dd);{int ijk; cin >> ijk;}
+
+    //  Initial sweep
+    SK.Defect(r, f, u);                      //  r := f - K*u
+
+    vddiv(w, r, dd);                         //  w := D^{-1}*r
+    double sigma0 = dscapr(w, r);            // s0 := <w,r>
+
+    // Iteration sweeps
+    int iter  = 0;
+    double sigma = sigma0;
+    while ( sigma > tol2 * sigma0 && maxiter > iter)
+    {
+        ++iter;
+        vdaxpy(u, u, omega, w );             //  u := u + om*w
+        SK.Defect(r, f, u);                  //  r := f - K*u
+        vddiv(w, r, dd);                     //  w := D^{-1}*r
+        sigma = dscapr(w, r);                // s0 := <w,r>
+//      	cout << "Iteration " << iter << " : " << sqrt(sigma/sigma0) << endl;
+    }
+    cout << "aver. Jacobi rate :  " << exp(log(sqrt(sigma / sigma0)) / iter) << "  (" << iter << " iter)" << endl;
+    cout << "final error: " << sqrt(sigma / sigma0) << " (rel)   " << sqrt(sigma) << " (abs)\n";
+
+    return;
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
--- a/ex3_benchmarks/benchmarks.h
+++ b/ex3_benchmarks/benchmarks.h
@ -0,0 +1,89 @@
+#pragma once
+#include "getmatrix.h"
+#include <vector>
+
+using namespace std;
+
+/**     (A) Inner product of two vectors (from skalar_stl)
+	@param[in] x	vector
+	@param[in] y	vector
+	@return 	    resulting Euclidian inner product <x,y>
+*/
+double scalar(vector<double> const &x, vector<double> const &y);
+
+/**     (A) 5.(b) Inner product of two vectors using the Kahan scalar product
+	@param[in] x	vector
+	@param[in] y	vector
+	@return 	    resulting Euclidian inner product <x,y>
+*/
+double Kahan_skalar(vector<double> const &x,  vector<double> const &y);
+
+/** 	(A) 6. cBLAS scalar product of two vectors
+	@param[in] x	vector
+	@param[in] y	vector
+	@return 	    resulting Euclidian inner product <x,y>
+*/
+double scalar_cBLAS(vector<double> const &x, vector<double> const &y);
+
+
+/** 	(B) Matrix-vector product (from intro_vector_densematrix)
+ * 	@param[in] A	dense matrix (1D access)
+ *  @param[in] u	vector
+ *
+ *	@return    resulting vector
+*/
+vector<double> MatVec(vector<double> const &A, vector<double> const &x);
+
+/** 	(B) 6. cBLAS Matrix-vector product
+ * 	@param[in] A	dense matrix (1D access)
+ *  @param[in] u	vector
+ *
+ *	@return    resulting vector
+*/
+vector<double> MatVec_cBLAS(vector<double> const &A, vector<double> const &x);
+
+
+/** 	(C) Matrix-matrix product
+ * 	@param[in] A			MxL dense matrix (1D access)
+ *  @param[in] B			LxN dense matrix (1D access)
+ *  @param[in] shared_dim 	shared dimension L
+ *
+ *	@return    resulting MxN matrix
+*/
+vector<double> MatMat(vector<double> const &A, vector<double> const &B, size_t const &shared_dim);
+
+/** 	(C) 6. cBLAS Matrix-matrix product
+ * 	@param[in] A			MxL dense matrix (1D access)
+ *  @param[in] B			LxN dense matrix (1D access)
+ *  @param[in] shared_dim 	shared dimension L
+ *
+ *	@return    resulting MxN matrix
+*/
+vector<double> MatMat_cBLAS(vector<double> const &A, vector<double> const &B, size_t const &shared_dim);
+
+
+/** 	(D) Evaluation of a polynomial function using Horner's scheme
+ * 	@param[in] a	coefficient vector
+ *  @param[in] x	vector with input values
+ *
+ *	@return    vector with output values
+*/
+vector<double> poly(vector<double> const &a, vector<double> const &x);
+
+
+/**     (E) Solves linear system of equations  K @p u = @p f  via the Jacobi iteration (from jaboci_oo_stl)
+ * We use a distributed symmetric  CSR matrix @p SK and initial guess of the
+ * solution is set to 0.
+ * @param[in] SK	CSR matrix
+ * @param[in] f		distributed local vector storing the right hand side
+ * @param[out] u	accumulated local vector storing the solution.
+*/
+void JacobiSolve(CRS_Matrix const &SK, vector<double> const &f, vector<double> &u);
+
+
+
+
+
+
+
+
--- a/ex3_benchmarks/factorization_solve.cpp
+++ b/ex3_benchmarks/factorization_solve.cpp
@ -0,0 +1,39 @@
+#include "factorization_solve.h"
+#include <vector>
+#include <math.h>
+#include <assert.h>
+#include <lapack.h>
+#include <iostream>
+
+using namespace std;
+
+
+void factorization_solve(vector<double> &A, vector<double> &b, const int32_t &n_rhs)
+{
+    int32_t nelem = A.size();
+    int32_t N = sqrt(nelem);
+    assert(N == static_cast<int32_t>(b.size())/n_rhs);
+
+    vector<int> IPIV(N); // pivot indices
+    int info_factorization;
+
+    dgetrf_(&N, &N, A.data(), &N, IPIV.data(), &info_factorization);
+
+    const char transp = 'N';
+    int info_solve;
+    dgetrs_(&transp, &N, &n_rhs, A.data(), &N, IPIV.data(), b.data(), &N, &info_solve, 1); // 1 is length of parameter 'N'
+
+}
+
+void print_matrix(vector<double> &A, size_t M, size_t N)
+{
+    for (size_t i = 0; i < M; ++i)
+    {
+        for (size_t j = 0; j < N; ++j)
+        {
+            cout << A[N*i + j] << "\t";
+        }
+        cout << endl;
+    }
+    cout << endl;
+}
--- a/ex3_benchmarks/factorization_solve.h
+++ b/ex3_benchmarks/factorization_solve.h
@ -0,0 +1,21 @@
+#pragma once
+#include <vector>
+#include <cstdint>
+using namespace std;
+
+
+/** 	Solve linear system of equations with multiple right hand sides using LU factorization
+ * 	@param[inout] A			NxN Matrix (1D access), gets modified to contain the LU decomposition of A
+ *  @param[inout] B			N x n_rhs Matrix of right-hand-sides (1D access), gets modified to contain the solution vectors x
+ *  @param[in] n_rhs 	    number of right-hand-sides b
+ *
+*/
+void factorization_solve(vector<double> &A, vector<double> &b, const int32_t &n_rhs);
+
+/** 	Print a matrix to console
+ * 	@param[in] A			MxN Matrix (1D access)
+ *  @param[in] M            rows			
+ *  @param[in] N            columns 	    
+ *
+*/
+void print_matrix(vector<double> &A, size_t M, size_t N);
--- a/ex3_benchmarks/factorization_solve_tests.cpp
+++ b/ex3_benchmarks/factorization_solve_tests.cpp
@ -0,0 +1,89 @@
+#include "factorization_solve_tests.h"
+#include "factorization_solve.h"
+#include "benchmarks.h"
+#include <chrono>
+#include <iostream>
+
+using namespace std::chrono; 
+
+void CheckCorrectness()
+{
+    cout.precision(4);
+
+    size_t N = 5;
+    size_t n_rhs = 5;
+    vector<double> A(N*N);
+    vector<double> b(N*n_rhs);
+
+    // initialize A
+    for (size_t i = 0; i < N; ++i)
+    {
+        for (size_t j = 0; j < N; ++j)
+            if (i == j)
+                A[N*i + j] = 4.0;
+            else
+                A[N*i + j] = 1.0/((i - j)*(i - j));
+    }
+    const vector<double> A_copy = A;
+        
+    // initialize b as identity matrix
+    for (size_t j = 0; j < N; ++j)
+    {
+        for (size_t l = 0; l < n_rhs; ++l)
+            if (l == j)
+                b[N*l + j] = 1.0;
+            else
+                b[N*l + j] = 0.0;
+    }
+    
+    cout << "A = " << endl;
+    print_matrix(A, N, N);
+    cout << "b = " << endl;
+    print_matrix(b, N, n_rhs);
+
+    // solve system
+    factorization_solve(A, b, n_rhs);
+
+    vector<double> check_matrix_identity = MatMat_cBLAS(A_copy, b, N);
+    cout << "A*A^{-1} = " << endl;
+    print_matrix(check_matrix_identity, N, n_rhs);
+}
+
+
+void CheckDuration(const size_t &N)
+{
+    for (size_t n_rhs : {1, 2, 4, 8, 16, 32})
+    {
+        auto time_start = system_clock::now();
+
+        vector<double> A(N*N);
+        vector<double> b(N*n_rhs);
+
+        // initialize A
+        for (size_t i = 0; i < N; ++i)
+        {
+            for (size_t j = 0; j < N; ++j)
+                if (i == j)
+                    A[N*i + j] = 4.0;
+                else
+                    A[N*i + j] = 1.0/((i - j)*(i - j));
+        }
+
+        // initialize b
+        for (size_t j = 0; j < N; ++j)
+            for (size_t l = 0; l < n_rhs; ++l)
+                b[N*l + j] = 1.0;
+
+
+        // solve system
+        factorization_solve(A, b, n_rhs);
+
+
+
+        auto time_end = system_clock::now();
+        auto duration = duration_cast<microseconds>(time_end - time_start);        // duration in microseconds
+        double t_diff = static_cast<double>(duration.count()) / 1e6;               // overall duration in seconds
+
+        cout << "n_rhs = " << n_rhs << "\tTime per rhs: " << t_diff/n_rhs << endl;
+    }
+}
--- a/ex3_benchmarks/factorization_solve_tests.h
+++ b/ex3_benchmarks/factorization_solve_tests.h
@ -0,0 +1,7 @@
+#pragma once
+#include <cstddef>
+
+
+void CheckCorrectness();
+
+void CheckDuration(const size_t &N);
--- a/ex3_benchmarks/geom.cpp
+++ b/ex3_benchmarks/geom.cpp
@ -0,0 +1,522 @@
+// see:   http://llvm.org/docs/CodingStandards.html#include-style
+#include "geom.h"
+
+#include <algorithm>
+#include <cassert>
+#include <fstream>
+#include <iostream>
+#include <list>
+#include <string>
+#include <vector>
+using namespace std;
+
+Mesh::Mesh(int ndim, int nvert_e, int ndof_e)
+    : _nelem(0), _nvert_e(nvert_e), _ndof_e(ndof_e), _nnode(0), _ndim(ndim), _ia(0), _xc(0)
+{
+}
+
+Mesh::~Mesh()
+{}
+
+void Mesh::SetValues(std::vector<double> &v, const std::function<double(double, double)> &func) const
+{
+    int const nnode = Nnodes();            // number of vertices in mesh
+    assert( nnode == static_cast<int>(v.size()) );
+    for (int k = 0; k < nnode; ++k)
+    {
+        v[k] = func( _xc[2 * k], _xc[2 * k + 1] );
+    }
+}
+
+
+void Mesh::Debug() const
+{
+    cout << "\n ############### Debug  M E S H  ###################\n";
+    cout << "\n ...............    Coordinates       ...................\n";
+    for (int k = 0; k < _nnode; ++k)
+    {
+        cout << k << " : " ;
+        for (int i = 0; i < _ndof_e; ++i )
+        {
+            cout << _xc[2*k+i] << "  ";
+        }
+        cout << endl;
+    }
+    cout << "\n ...............    Elements        ...................\n";
+    for (int k = 0; k < _nelem; ++k)
+    {
+        cout << k << " : ";
+        for (int i = 0; i < _ndof_e; ++i )
+            cout << _ia[_ndof_e * k + i] << "  ";
+        cout << endl;
+    }
+    return;
+}
+
+void Mesh::Write_ascii_matlab(std::string const &fname, std::vector<double> const &v) const
+{
+    assert(Nnodes() ==  static_cast<int>(v.size()));  // fits vector length to mesh information?
+
+    ofstream fout(fname);                             // open file ASCII mode
+    if ( !fout.is_open() )
+    {
+        cout << "\nFile " << fname << " has not been opened.\n\n" ;
+        assert( fout.is_open() && "File not opened."  );
+    }
+
+    string const DELIMETER(" ");    // define the same delimeter as in matlab/ascii_read*.m
+    int const    OFFSET(1);         // convert C-indexing to matlab
+
+    // Write data: #nodes, #space dimensions, #elements, #vertices per element
+    fout << Nnodes() << DELIMETER << Ndims() << DELIMETER << Nelems() << DELIMETER << NverticesElements() << endl;
+
+    // Write cordinates: x_k, y_k   in seperate lines
+    assert( Nnodes()*Ndims() ==  static_cast<int>(_xc.size()));
+    for (int k = 0, kj = 0; k < Nnodes(); ++k)
+    {
+        for (int j = 0; j < Ndims(); ++j, ++kj)
+        {
+            fout << _xc[kj] << DELIMETER;
+        }
+        fout << endl;
+    }
+
+    // Write connectivity: ia_k,0, ia_k,1 etc  in seperate lines
+    assert( Nelems()*NverticesElements() ==  static_cast<int>(_ia.size()));
+    for (int k = 0, kj = 0; k < Nelems(); ++k)
+    {
+        for (int j = 0; j < NverticesElements(); ++j, ++kj)
+        {
+            fout << _ia[kj] + OFFSET << DELIMETER;     // C to matlab
+        }
+        fout << endl;
+    }
+
+    // Write vector
+    for (int k = 0; k < Nnodes(); ++k)
+    {
+        fout << v[k] << endl;
+    }
+
+    fout.close();
+    return;
+}
+
+
+void Mesh::Visualize(std::vector<double> const &v) const
+{
+    // define external command
+    const string exec_m("matlab -nosplash < visualize_results.m");                 // Matlab
+    //const string exec_m("octave --no-window-system --no-gui visualize_results.m"); // Octave
+    //const string exec_m("flatpak run org.octave.Octave visualize_results.m");      // Octave (flatpak): desktop GH
+
+    const string fname("uv.txt");
+    Write_ascii_matlab(fname, v);
+
+    int ierror = system(exec_m.c_str());                                 // call external command
+
+    if (ierror != 0)
+    {
+        cout << endl << "Check path to Matlab/octave on your system" << endl;
+    }
+    cout << endl;
+    return;
+}
+
+
+
+
+// #####################################################################
+Mesh_2d_3_square::Mesh_2d_3_square(int nx, int ny, int myid, int procx, int procy)
+    : Mesh(2, 3, 3), // two dimensions, 3 vertices, 3 dofs
+      _myid(myid), _procx(procx), _procy(procy), _neigh{{-1, -1, -1, -1}}, _color(0),
+_xl(0.0), _xr(1.0), _yb(0.0), _yt(1.0), _nx(nx), _ny(ny)
+{
+    //void IniGeom(int const myid, int const procx, int const procy, int neigh[], int &color)
+    int const ky = _myid / _procx;
+    int const kx = _myid % _procy;  //    MOD(myid,procx)
+    // Determine the neighbors of domain/rank myid
+    _neigh[0] = (ky == 0)       ?  -1 : _myid - _procx;    //   South
+    _neigh[1] = (kx == _procx - 1) ?  -1 : _myid + 1;      //   East
+    _neigh[2] = (ky == _procy - 1) ?  -1 : _myid + _procx;  //   North
+    _neigh[3] = (kx == 0)       ?  -1 : _myid - 1;        //   West
+
+    _color = (kx + ky) & 1 ;
+
+    // subdomain is part of unit square
+    double const hx = 1. / _procx;
+    double const hy = 1. / _procy;
+    _xl = kx * hx;                      //  left
+    _xr = (kx + 1) * hx;                //  right
+    _yb = ky * hy;                      //  bottom
+    _yt = (ky + 1) * hy;                //  top
+
+    // Calculate coordinates
+    int const nnode = (_nx + 1) * (_ny + 1); // number of nodes
+    Resize_Coords(nnode, 2);                 // coordinates in 2D [nnode][ndim]
+    GetCoordsInRectangle(_nx, _ny, _xl, _xr, _yb, _yt, GetCoords().data());
+
+    // Calculate element connectivity (linear triangles)
+    int const nelem = 2 * _nx * _ny;         // number of elements
+    Resize_Connectivity(nelem, 3);           // connectivity matrix [nelem][3]
+    GetConnectivityInRectangle(_nx, _ny, GetConnectivity().data());
+
+    return;
+}
+
+
+void Mesh_2d_3_square::SetU(std::vector<double> &u) const
+{
+    int dx    = _nx + 1;
+    for (int j = 0; j <= _ny; ++j)
+    {
+        int k = j * dx;
+        for (int i = 0; i <= _nx; ++i, ++k)
+        {
+            u[k] = 0.0;
+        }
+    }
+
+}
+
+void Mesh_2d_3_square::SetF(std::vector<double> &f) const
+{
+    int dx    = _nx + 1;
+    for (int j = 0; j <= _ny; ++j)
+    {
+        int k = j * dx;
+        for (int i = 0; i <= _nx; ++i, ++k)
+        {
+            f[k] = 1.0;
+        }
+    }
+
+}
+
+
+std::vector<int> Mesh_2d_3_square::Index_DirichletNodes() const
+{
+    int const dx = 1,
+              dy = _nx + 1,
+              bl  = 0,
+              br  = _nx,
+              tl  = _ny * (_nx + 1),
+              tr  = (_ny + 1) * (_nx + 1) - 1;
+    int const start[4] = { bl, br, tl, bl},
+                         end[4] = { br, tr, tr, tl},
+                                  step[4] = { dx, dy, dx, dy};
+
+    vector<int> idx(0);
+    for (int j = 0; j < 4; j++)
+    {
+        if (_neigh[j] < 0)
+        {
+            for (int i = start[j]; i <= end[j]; i += step[j])
+            {
+                idx.push_back(i);        // node i is Dirichlet node
+            }
+        }
+    }
+    //    remove multiple elements
+    sort(idx.begin(), idx.end());                          // sort
+    idx.erase( unique(idx.begin(), idx.end()), idx.end() ); // remove duplicate data
+
+    return idx;
+}
+
+void Mesh_2d_3_square::SaveVectorP(std::string const &name, vector<double> const &u) const
+{
+//  construct the file name for subdomain myid
+    const string tmp( std::to_string(_myid / 100) + to_string((_myid % 100) / 10) + to_string(_myid % 10) );
+
+    const string namep = name + "." + tmp;
+    ofstream ff(namep.c_str());
+    ff.precision(6);
+    ff.setf(ios::fixed, ios::floatfield);
+
+    // assumes tensor product grid in unit square; rowise numbered (as generated in class constructor)
+    // output is provided for tensor product grid visualization ( similar to Matlab-surf() )
+    auto const &xc = GetCoords();
+    int k = 0;
+    for (int j = 0; j <= _ny; ++j)
+    {
+        for (int i = 0; i <= _nx; ++i, ++k)
+            ff << xc[2 * k + 0] << "   " << xc[2 * k + 1] << "   " << u[k] << endl;
+        ff << endl;
+    }
+
+    ff.close();
+    return;
+}
+
+void Mesh_2d_3_square::GetCoordsInRectangle(int const nx, int const ny,
+        double const xl, double const xr, double const yb, double const yt,
+        double xc[])
+{
+    const double hx = (xr - xl) / nx,
+                 hy = (yt - yb) / ny;
+
+    int k = 0;
+    for (int j = 0; j <= ny; ++j)
+    {
+        const double y0 = yb + j * hy;
+        for (int i = 0; i <= nx; ++i, k += 2)
+        {
+            xc[k  ] = xl + i * hx;
+            xc[k + 1] = y0;
+        }
+    }
+
+    return;
+}
+
+void Mesh_2d_3_square::GetConnectivityInRectangle(int const nx, int const ny, int ia[])
+{
+    const int dx = nx + 1;
+    int k  = 0;
+    int l  = 0;
+    for (int j = 0; j < ny; ++j, ++k)
+    {
+        for (int i = 0; i < nx; ++i, ++k)
+        {
+            ia[l  ] = k;
+            ia[l + 1] = k + 1;
+            ia[l + 2] = k + dx + 1;
+            l += 3;
+            ia[l  ] = k;
+            ia[l + 1] = k + dx;
+            ia[l + 2] = k + dx + 1;
+            l += 3;
+        }
+    }
+    return;
+}
+
+// #################### still some old code (--> MPI) ############################
+
+
+//  Copies the values of w corresponding to the boundary
+//  South (ib==1), East (ib==2), North (ib==3), West (ib==4)
+
+void GetBound(int const ib, int const nx, int const ny, double const w[], double s[])
+{
+    const int //dx = 1,
+    dy = nx + 1,
+    bl  = 0,
+    br  = nx,
+    tl  = ny * (nx + 1),
+    tr  = (ny + 1) * (nx + 1) - 1;
+    switch (ib)
+    {
+        case 1:
+        {
+            for (int i = bl, j = 0; i <= br; ++i, ++j)
+                s[j] = w[i];
+            break;
+        }
+        case 3:
+        {
+            for (int i = tl, j = 0; i <= tr; ++i, ++j)
+                s[j] = w[i];
+            break;
+        }
+        case 4:
+        {
+            for (int i = bl, j = 0; i <= tl; i += dy, ++j)
+                s[j] = w[i];
+            break;
+        }
+        case 2:
+        {
+            for (int i = br, j = 0; i <= tr; i += dy, ++j)
+                s[j] = w[i];
+            break;
+        }
+        default:
+        {
+            cout << endl << "Wrong parameter  ib in " << __FILE__ << ":" << __LINE__ << endl;
+        }
+    }
+    return;
+}
+
+// ----------------------------------------------------------------------------------------------------------
+// Computes w:  = w + s at nodes on  the boundary
+// South (ib == 1), East (ib == 2), North (ib == 3), West (ib == 4)
+
+void AddBound(int const ib, int const nx, int const ny, double w[], double const s[])
+{
+    int const dy = nx + 1,
+              bl  = 0,
+              br  = nx,
+              tl  = ny * (nx + 1),
+              tr  = (ny + 1) * (nx + 1) - 1;
+    switch (ib)
+    {
+        case 1:
+        {
+            for (int i = bl, j = 0; i <= br; ++i, ++j)
+                w[i] += s[j];
+            break;
+        }
+        case 3:
+        {
+            for (int i = tl, j = 0; i <= tr; ++i, ++j)
+                w[i] += s[j];
+            break;
+        }
+        case 4:
+        {
+            for (int i = bl, j = 0; i <= tl; i += dy, ++j)
+                w[i] += s[j];
+            break;
+        }
+        case 2:
+        {
+            for (int i = br, j = 0; i <= tr; i += dy, ++j)
+                w[i] += s[j];
+            break;
+        }
+        default:
+        {
+            cout << endl << "Wrong parameter  ib in " << __FILE__ << ":" << __LINE__ << endl;
+        }
+    }
+    return;
+}
+
+
+// ####################################################################
+
+Mesh_2d_3_matlab::Mesh_2d_3_matlab(string const &fname)
+    : Mesh(2, 3, 3),  // two dimensions, 3 vertices, 3 dofs
+      bedges(0)
+{
+    ifstream ifs(fname);
+    if (!(ifs.is_open() && ifs.good()))
+    {
+        cerr << "Mesh_2d_3_matlab: Error cannot open file " << fname << endl;
+        assert(ifs.is_open());
+    }
+
+    int const OFFSET(1);             // Matlab to C indexing
+    cout << "ASCI file  " << fname << "  opened" << endl;
+
+    // Read some mesh constants
+    int nnode, ndim, nelem, nvert_e;
+    ifs >> nnode >> ndim >> nelem >> nvert_e;
+    cout << nnode << "  " << ndim << "  " << nelem << "  " << nvert_e << endl;
+    assert(ndim == 2 && nvert_e == 3);
+
+    // Allocate memory
+    Resize_Coords(nnode, ndim);                 // coordinates in 2D [nnode][ndim]
+    Resize_Connectivity(nelem, nvert_e);        // connectivity matrix [nelem][nvert]
+
+    // Read ccordinates
+    auto &xc = GetCoords();
+    for (int k = 0; k < nnode * ndim; ++k)
+    {
+        ifs >> xc[k];
+    }
+
+    // Read connectivity
+    auto &ia = GetConnectivity();
+    for (int k = 0; k < nelem * nvert_e; ++k)
+    {
+        ifs >> ia[k];
+        ia[k] -= OFFSET;                // Matlab to C indexing
+    }
+
+    // additional read of boundary information (only start/end point)
+    int nbedges;
+    ifs >> nbedges;
+
+    bedges.resize(nbedges * 2);
+    for (int k = 0; k < nbedges * 2; ++k)
+    {
+        ifs >> bedges[k];
+        bedges[k] -= OFFSET;            // Matlab to C indexing
+    }
+
+    return;
+}
+
+// binary
+//{
+//ifstream ifs(fname, ios_base::in | ios_base::binary);
+//if(!(ifs.is_open() && ifs.good()))
+//{
+//cerr << "ReadBinMatrix: Error cannot open file " << file << endl;
+//assert(ifs.is_open());
+//}
+//cout << "ReadBinMatrix: file opened" << file << endl;
+
+
+//}
+
+// binaryIO.cpp
+//void read_binMatrix(const string& file, vector<int> &cnt, vector<int> &col, vector<double> &ele)
+//{
+
+//ifstream ifs(file, ios_base::in | ios_base::binary);
+
+//if(!(ifs.is_open() && ifs.good()))
+//{
+//cerr << "ReadBinMatrix: Error cannot open file " << file << endl;
+//assert(ifs.is_open());
+//}
+//cout << "ReadBinMatrix: Opened file " << file << endl;
+
+//int _size;
+
+//ifs.read(reinterpret_cast<char*>(&_size), sizeof(int));   // old: ifs.read((char*)&_size, sizeof(int));
+//cnt.resize(_size);
+//cout << "ReadBinMatrix: cnt size: " << _size << endl;
+
+//ifs.read((char*)&_size, sizeof(int));
+//col.resize(_size);
+//cout << "ReadBinMatrix: col size: " << _size << endl;
+
+//ifs.read((char*)&_size, sizeof(int));
+//ele.resize(_size);
+//cout << "ReadBinMatrix: ele size: " << _size << endl;
+
+
+//ifs.read((char*)cnt.data(), cnt.size() * sizeof(int));
+//ifs.read((char*)col.data(), col.size() * sizeof(int));
+//ifs.read((char*)ele.data(), ele.size() * sizeof(double));
+
+//ifs.close();
+//cout << "ReadBinMatrix: Finished reading matrix.." << endl;
+
+//}
+
+
+std::vector<int> Mesh_2d_3_matlab::Index_DirichletNodes() const
+{
+    vector<int> idx(bedges);                              // copy
+
+    sort(idx.begin(), idx.end());                         // sort
+    idx.erase( unique(idx.begin(), idx.end()), idx.end() ); // remove duplicate data
+
+    return idx;
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
--- a/ex3_benchmarks/geom.h
+++ b/ex3_benchmarks/geom.h
@ -0,0 +1,381 @@
+#ifndef GEOM_FILE
+#define GEOM_FILE
+#include <array>
+#include <functional>             // function; C++11
+#include <string>
+#include <vector>
+
+/**
+ * Basis class for finite element meshes.
+ */
+class Mesh
+{
+public:
+    /**
+     * Constructor initializing the members with default values.
+      *
+     * @param[in] ndim  space dimensions (dimension for coordinates)
+      * @param[in] nvert_e  number of vertices per element (dimension for connectivity)
+      * @param[in] ndof_e   degrees of freedom per element (= @p nvert_e for linear elements)
+     */
+    explicit Mesh(int ndim, int nvert_e = 0, int ndof_e = 0);
+
+    /**
+     * Destructor.
+     *
+     * See clang warning on
+     * <a href="https://stackoverflow.com/questions/28786473/clang-no-out-of-line-virtual-method-definitions-pure-abstract-c-class/40550578">weak-vtables</a>.
+     */
+    virtual ~Mesh();
+
+    /**
+     * Number of finite elements in (sub)domain.
+     * @return number of elements.
+     */
+    int Nelems() const
+    {
+        return _nelem;
+    }
+
+    /**
+     * Global number of vertices for each finite element.
+     * @return number of vertices per element.
+     */
+    int NverticesElements() const
+    {
+        return _nvert_e;
+    }
+
+    /**
+     * Global number of degrees of freedom (dof) for each finite element.
+     * @return degrees of freedom per element.
+     */
+    int NdofsElement() const
+    {
+        return _ndof_e;
+    }
+
+    /**
+     * Number of vertices in mesh.
+     * @return number of vertices.
+     */
+    int Nnodes() const
+    {
+        return _nnode;
+    }
+
+    /**
+     * Space dimension.
+     * @return number of dimensions.
+     */
+    int Ndims() const
+    {
+        return _ndim;
+    }
+
+    /**
+     * (Re-)Allocates memory for the element connectivity and redefines the appropriate dimensions.
+      *
+     * @param[in] nelem    number of elements
+      * @param[in] nvert_e  number of vertices per element
+     */
+    void Resize_Connectivity(int nelem, int nvert_e)
+    {
+        SetNelem(nelem);               // number of elements
+        SetNverticesElement(nvert_e);  // vertices per element
+        _ia.resize(nelem * nvert_e);
+    }
+
+    /**
+     * Read connectivity information (g1,g2,g3)_i.
+     * @return convectivity vector [nelems*ndofs].
+     */
+    const std::vector<int>  &GetConnectivity() const
+    {
+        return _ia;
+    }
+
+    /**
+     * Access/Change connectivity information (g1,g2,g3)_i.
+     * @return convectivity vector [nelems*ndofs].
+     */
+    std::vector<int>  &GetConnectivity()
+    {
+        return _ia;
+    }
+
+    /**
+     * (Re-)Allocates memory for the element connectivity and redefines the appropriate dimensions.
+      *
+     * @param[in] nnodes    number of nodes
+      * @param[in] ndim      space dimension
+     */
+    void Resize_Coords(int nnodes, int ndim)
+    {
+        SetNnode(nnodes);       // number of nodes
+        SetNdim(ndim);          // space dimension
+        _xc.resize(nnodes * ndim);
+    }
+
+    /**
+     * Read coordinates of vertices (x,y)_i.
+     * @return coordinates vector [nnodes*2].
+     */
+    const std::vector<double> &GetCoords() const
+    {
+        return _xc;
+    }
+
+    /**
+     * Access/Change coordinates of vertices (x,y)_i.
+     * @return coordinates vector [nnodes*2].
+     */
+    std::vector<double> &GetCoords()
+    {
+        return _xc;
+    }
+
+    /**
+     * Calculate values in vector @p v via function @p func(x,y)
+     * @param[in] v     vector
+      * @param[in] func  function of (x,y) returning a double value.
+     */
+    void SetValues(std::vector<double> &v, const std::function<double(double, double)> &func) const;
+
+    /**
+     * Prints the information for a finite element mesh
+     */
+    void Debug() const;
+
+    /**
+     * Determines the indices of those vertices with Dirichlet boundary conditions
+     * @return index vector.
+     */
+    virtual std::vector<int> Index_DirichletNodes() const = 0;
+
+    /**
+     * Write vector @p v toghether with its mesh information to an ASCii file @p fname.
+      *
+      * The data are written in C-style.
+      *
+      * @param[in] fname  file name
+      * @param[in] v      vector
+     */
+    void Write_ascii_matlab(std::string const &fname, std::vector<double> const &v) const;
+
+    /**
+     * Visualize @p v together with its mesh information via matlab or octave.
+      *
+      * Comment/uncomment those code lines in method Mesh:Visualize (geom.cpp)
+      * that are supported on your system.
+      *
+      * @param[in] v    vector
+      *
+      * @warning matlab files ascii_read_meshvector.m  visualize_results.m
+      *          must be in the executing directory.
+     */
+    void Visualize(std::vector<double> const &v) const;
+
+
+protected:
+    void SetNelem(int nelem)
+    {
+        _nelem = nelem;
+    }
+
+    void SetNverticesElement(int nvert)
+    {
+        _nvert_e = nvert;
+    }
+
+    void SetNdofsElement(int ndof)
+    {
+        _ndof_e = ndof;
+    }
+
+    void SetNnode(int nnode)
+    {
+        _nnode = nnode;
+    }
+
+    void SetNdim(int ndim)
+    {
+        _ndim = ndim;
+    }
+
+private:
+    int _nelem;         //!< number elements
+    int _nvert_e;       //!< number of vertices per element
+    int _ndof_e;        //!< degrees of freedom (d.o.f.) per element
+    int _nnode;         //!< number nodes/vertices
+    int _ndim;          //!< space dimension of the problem (1, 2, or 3)
+    std::vector<int> _ia;    //!< element connectivity
+    std::vector<double> _xc; //!< coordinates
+};
+
+/**
+ * 2D finite element mesh of the square consiting of linear triangular elements.
+ */
+class Mesh_2d_3_square: public Mesh
+{
+public:
+    /**
+     * Generates the f.e. mesh for the unit square.
+     *
+     * @param[in] nx    number of discretization intervals in x-direction
+     * @param[in] ny    number of discretization intervals in y-direction
+     * @param[in] myid  my MPI-rank / subdomain
+     * @param[in] procx number of ranks/subdomains in x-direction
+     * @param[in] procy number of processes in y-direction
+    */
+    Mesh_2d_3_square(int nx, int ny, int myid = 0, int procx = 1, int procy = 1);
+
+    /**
+     * Destructor
+     */
+    ~Mesh_2d_3_square() override
+    {}
+
+    /**
+     * Set solution vector based on a tensor product grid in the rectangle.
+     * @param[in] u solution vector
+     */
+    void SetU(std::vector<double> &u) const;
+
+    /**
+     * Set right hand side (rhs) vector on a tensor product grid in the rectangle.
+     * @param[in] f rhs vector
+     */
+    void SetF(std::vector<double> &f) const;
+
+    /**
+     * Determines the indices of those vertices with Dirichlet boundary conditions
+     * @return index vector.
+     */
+    std::vector<int> Index_DirichletNodes() const override;
+
+    /**
+      * Stores the values of vector @p u of (sub)domain into a file @p name for further processing in gnuplot.
+      * The file stores rowise the x- and y- coordinates together with the value from  @p u .
+      * The domain [@p xl, @p xr] x [@p yb, @p yt] is discretized into @p nx x @p ny intervals.
+      *
+      * @param[in] name basename of file name (file name will be extended by the rank number)
+      * @param[in] u    local vector
+      *
+      * @warning   Assumes tensor product grid in unit square; rowise numbered
+      *            (as generated in class constructor).
+      *            The output is provided for tensor product grid visualization
+      *            ( similar to Matlab-surf() ).
+      *
+      * @see Mesh_2d_3_square
+      */
+    void SaveVectorP(std::string const &name, std::vector<double> const &u) const;
+
+    // here will still need to implement in the class
+    //  GetBound(), AddBound()
+    //  or better a generalized way with indices and their appropriate ranks for MPI communication
+
+private:
+    /**
+      * Determines the coordinates of the dicretization nodes of the domain [@p xl, @p xr] x [@p yb, @p yt]
+      * which is discretized into @p nx x @p ny intervals.
+      *
+      * @param[in] ny   number of discretization intervals in y-direction
+      * @param[in] xl   x-coordinate of left boundary
+      * @param[in] xr   x-coordinate of right boundary
+      * @param[in] yb   y-coordinate of lower boundary
+      * @param[in] yt   y-coordinate of upper boundary
+      * @param[out] xc  coordinate vector of length 2n with x(2*k,2*k+1) as coodinates of node k
+      */
+
+    void GetCoordsInRectangle(int nx, int ny, double xl, double xr, double yb, double yt,
+                              double xc[]);
+    /**
+      * Determines the element connectivity of linear triangular elements of a FEM discretization
+      * of a rectangle using @p nx x @p ny equidistant intervals for discretization.
+      * @param[in] nx   number of discretization intervals in x-direction
+      * @param[in] ny   number of discretization intervals in y-direction
+      * @param[out] ia  element connectivity matrix with ia(3*s,3*s+1,3*s+2) as node numbers od element s
+      */
+    void GetConnectivityInRectangle(int nx, int ny, int ia[]);
+
+private:
+    int _myid;          //!< my MPI rank
+    int _procx;         //!< number of MPI ranks in x-direction
+    int _procy;         //!< number of MPI ranks in y-direction
+    std::array<int, 4> _neigh; //!< MPI ranks of neighbors (negative: no neighbor but b.c.)
+    int _color;         //!< red/black coloring (checker board) of subdomains
+
+    double _xl;         //!< x coordinate of lower left  corner of square
+    double _xr;         //!< x coordinate of lower right corner of square
+    double _yb;         //!< y coordinate or lower left  corner of square
+    double _yt;         //!< y coordinate of upper right corner of square
+    int    _nx;         //!< number of intervals in x-direction
+    int    _ny;         //!< number of intervals in y-direction
+};
+
+// #################### still some old code (--> MPI) ############################
+/**
+ * Copies the values of @p w corresponding to boundary @p ib
+ * onto vector s.  South (ib==1), East (ib==2), North (ib==3), West (ib==4).
+ * The vector @p s has to be long enough!!
+ * @param[in] ib    my local boundary
+ * @param[in] nx    number of discretization intervals in x-direction
+ * @param[in] ny    number of discretization intervals in y-direction
+ * @param[in] w     vector for all nodes of local discretization
+ * @param[out] s    short vector with values on boundary @p ib
+*/
+// GH_NOTE: Absicherung bei s !!
+void GetBound(int ib, int nx, int ny, double const w[], double s[]);
+
+
+/**
+ * Computes @p w := @p w + @p s  at the interface/boundary nodes on the
+ * boundary @p ib .  South (ib==1), East (ib==2), North (ib==3), West (ib==4)
+ * @param[in] ib    my local boundary
+ * @param[in] nx    number of discretization intervals in x-direction
+ * @param[in] ny    number of discretization intervals in y-direction
+ * @param[in,out] w vector for all nodes of local discretization
+ * @param[in] s     short vector with values on boundary @p ib
+*/
+void AddBound(int ib, int nx, int ny, double w[], double const s[]);
+
+// #################### Mesh from Matlab ############################
+/**
+ * 2D finite element mesh of the square consiting of linear triangular elements.
+ */
+class Mesh_2d_3_matlab: public Mesh
+{
+public:
+    /**
+     * Reads mesh data from a binary file.
+     *
+     * File format, see ascii_write_mesh.m
+     *
+     * @param[in] fname file name
+    */
+    explicit Mesh_2d_3_matlab(std::string const &fname);
+
+    /**
+     * Determines the indices of those vertices with Dirichlet boundary conditions.
+     * @return index vector.
+      *
+      * @warning All boundary nodes are considered as Dirchlet nodes.
+     */
+    std::vector<int> Index_DirichletNodes() const override;
+
+private:
+    /**
+     * Determines the indices of those vertices with Dirichlet boundary conditions
+     * @return index vector.
+     */
+    int Nnbedges() const
+    {
+        return static_cast<int>(bedges.size());
+    }
+
+    std::vector<int> bedges;     //!< boundary edges [nbedges][2] storing start/end vertex
+
+};
+
+#endif
--- a/ex3_benchmarks/getmatrix.cpp
+++ b/ex3_benchmarks/getmatrix.cpp
@ -0,0 +1,347 @@
+#include "getmatrix.h"
+#include "userset.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cmath>
+#include <iomanip>
+#include <iostream>
+#include <list>
+#include <vector>
+using namespace std;
+
+
+//  general routine for lin. triangular elements
+
+void CalcElem(int const ial[3], double const xc[], double ske[3][3], double fe[3])
+//void CalcElem(const int* __restrict__ ial, const double* __restrict__ xc, double* __restrict__ ske[3], double* __restrict__ fe)
+{
+    const int  i1  = 2 * ial[0],   i2 = 2 * ial[1],   i3 = 2 * ial[2];
+    const double x13 = xc[i3 + 0] - xc[i1 + 0],  y13 = xc[i3 + 1] - xc[i1 + 1],
+                 x21 = xc[i1 + 0] - xc[i2 + 0],  y21 = xc[i1 + 1] - xc[i2 + 1],
+                 x32 = xc[i2 + 0] - xc[i3 + 0],  y32 = xc[i2 + 1] - xc[i3 + 1];
+    const double jac = fabs(x21 * y13 - x13 * y21);
+
+    ske[0][0] = 0.5 / jac * (y32 * y32 + x32 * x32);
+    ske[0][1] = 0.5 / jac * (y13 * y32 + x13 * x32);
+    ske[0][2] = 0.5 / jac * (y21 * y32 + x21 * x32);
+    ske[1][0] = ske[0][1];
+    ske[1][1] = 0.5 / jac * (y13 * y13 + x13 * x13);
+    ske[1][2] = 0.5 / jac * (y21 * y13 + x21 * x13);
+    ske[2][0] = ske[0][2];
+    ske[2][1] = ske[1][2];
+    ske[2][2] = 0.5 / jac * (y21 * y21 + x21 * x21);
+
+    const double xm    = (xc[i1 + 0] + xc[i2 + 0] + xc[i3 + 0]) / 3.0,
+                 ym    = (xc[i1 + 1] + xc[i2 + 1] + xc[i3 + 1]) / 3.0;
+    //fe[0] = fe[1] = fe[2] = 0.5 * jac * FunctF(xm, ym) / 3.0;
+    fe[0] = fe[1] = fe[2] = 0.5 * jac * fNice(xm, ym) / 3.0;
+}
+
+
+// general routine for lin. triangular elements,
+// non-symm. matrix
+// node numbering in element:  a s c e n d i n g   indices !!
+// GH: deprecated
+void AddElem(int const ial[3], double const ske[3][3], double const fe[3],
+             int const id[], int const ik[], double sk[], double f[])
+{
+    for (int i = 0; i < 3; ++i)
+    {
+        const int ii  = ial[i],           // row ii (global index)
+                  id1 = id[ii],           // start and
+                  id2 = id[ii + 1];       // end of row ii in matrix
+        int ip  = id1;
+        for (int j = 0; j < 3; ++j)       // no symmetry assumed
+        {
+            const int jj = ial[j];
+            bool not_found = true;
+            do       // find entry jj (global index) in row ii
+            {
+                not_found = (ik[ip] != jj);
+                ++ip;
+            }
+            while (not_found && ip < id2);
+
+#ifndef NDEBUG                 // compiler option -DNDEBUG switches off the check
+            if (not_found)     // no entry found !!
+            {
+                cout << "Error in AddElem: (" << ii << "," << jj << ") ["
+                     << ial[0] << "," << ial[1] << "," << ial[2] << "]\n";
+                assert(!not_found);
+            }
+#endif
+            sk[ip - 1] += ske[i][j];
+        }
+        f[ii] += fe[i];
+    }
+}
+
+
+// ----------------------------------------------------------------------------
+
+
+
+
+// ####################################################################
+
+CRS_Matrix::CRS_Matrix(Mesh const &mesh)
+    : _mesh(mesh), _nrows(0), _nnz(0), _id(0), _ik(0), _sk(0)
+{
+    Derive_Matrix_Pattern();
+    return;
+}
+
+void CRS_Matrix::Derive_Matrix_Pattern()
+{
+    int const nelem(_mesh.Nelems());
+    int const ndof_e(_mesh.NdofsElement());
+    auto const &ia(_mesh.GetConnectivity());
+//  Determine the number of matrix rows
+    _nrows = *max_element(ia.cbegin(), ia.cbegin() + ndof_e * nelem);
+    ++_nrows;                                 // node numberng: 0 ... nnode-1
+    assert(*min_element(ia.cbegin(), ia.cbegin() + ndof_e * nelem) == 0); // numbering starts with 0 ?
+
+//  Collect for each node those nodes it is connected to (multiple entries)
+//  Detect the neighboring nodes
+    vector< list<int> > cc(_nrows);             //  cc[i] is the  list of nodes a node i is connected to
+    for (int i = 0; i < nelem; ++i)
+    {
+        int const idx = ndof_e * i;
+        for (int k = 0; k < ndof_e; ++k)
+        {
+            list<int> &cck = cc.at(ia[idx + k]);
+            cck.insert( cck.end(), ia.cbegin() + idx, ia.cbegin() + idx + ndof_e );
+        }
+    }
+//  Delete the multiple entries
+    _nnz = 0;
+    for (auto &it : cc)
+    {
+        it.sort();
+        it.unique();
+        _nnz += static_cast<int>(it.size());
+        // cout << it.size() << " :: "; copy(it->begin(),it->end(), ostream_iterator<int,char>(cout,"  ")); cout << endl;
+    }
+
+// CSR data allocation
+    _id.resize(_nrows + 1);                  // Allocate memory for CSR row pointer
+    _ik.resize(_nnz);                        // Allocate memory for CSR column index vector
+
+//  copy CSR data
+    _id[0] = 0;                                 // begin of first row
+    for (size_t i = 0; i < cc.size(); ++i)
+    {
+        //cout << i << "   " << nid.at(i) << endl;;
+        const list<int> &ci = cc.at(i);
+        const auto nci = static_cast<int>(ci.size());
+        _id[i + 1] = _id[i] + nci; // begin of next line
+        copy(ci.begin(), ci.end(), _ik.begin() + _id[i] );
+    }
+
+    assert(_nnz == _id[_nrows]);
+    _sk.resize(_nnz);                      // Allocate memory for CSR column index vector
+    return;
+}
+
+
+void CRS_Matrix::Debug() const
+{
+//  ID points to first entry of row
+//  no symmetry assumed
+    cout << "\nMatrix  (nnz = " << _id[_nrows] << ")\n";
+
+    for (int row = 0; row < _nrows; ++row)
+    {
+        cout << "Row " << row << " : ";
+        int const id1 = _id[row];
+        int const id2 = _id[row + 1];
+        for (int j = id1; j < id2; ++j)
+        {
+            cout.setf(ios::right, ios::adjustfield);
+            cout << "[" << setw(2) << _ik[j] << "]  " << setw(4) << _sk[j] << "  ";
+        }
+        cout << endl;
+    }
+    return;
+}
+
+void CRS_Matrix::CalculateLaplace(vector<double> &f)
+{
+    assert(_mesh.NdofsElement() == 3);               // only for triangular, linear elements
+    //cout << _nnz << " vs. " << _id[_nrows] << "  " << _nrows<< endl;
+    assert(_nnz == _id[_nrows]);
+
+    for (int k = 0; k < _nrows; ++k)
+    {
+        _sk[k] = 0.0;
+    }
+    for (int k = 0; k < _nrows; ++k)
+    {
+        f[k] = 0.0;
+    }
+
+    double ske[3][3], fe[3];
+    //  Loop over all elements
+    auto const nelem = _mesh.Nelems();
+    auto const &ia   = _mesh.GetConnectivity();
+    auto const &xc   = _mesh.GetCoords();
+
+    for (int i = 0; i < nelem; ++i)
+    {
+        CalcElem(ia.data() + 3 * i, xc.data(), ske, fe);
+        //AddElem(ia.data()+3 * i, ske, fe, _id.data(), _ik.data(), _sk.data(), f.data()); // GH: deprecated
+        AddElem_3(ia.data() + 3 * i, ske, fe, f);
+    }
+
+    //Debug();
+
+    return;
+}
+
+void CRS_Matrix::ApplyDirichletBC(std::vector<double> const &u, std::vector<double> &f)
+{
+    double const PENALTY = 1e6;
+    auto const idx = _mesh.Index_DirichletNodes();
+    int const nidx = static_cast<int>(idx.size());
+
+    for (int row = 0; row < nidx; ++row)
+    {
+        int const k = idx[row];
+        int const id1 = fetch(k, k); // Find diagonal entry of row
+        assert(id1 >= 0);
+        _sk[id1] += PENALTY;		// matrix weighted scaling feasible
+        f[k] += PENALTY * u[k];
+    }
+
+    return;
+}
+
+void CRS_Matrix::GetDiag(vector<double> &d) const
+{
+    assert( _nrows == static_cast<int>(d.size()) );
+
+    for (int row = 0; row < _nrows; ++row)
+    {
+        const int ia = fetch(row, row); // Find diagonal entry of row
+        assert(ia >= 0);
+        d[row] = _sk[ia];
+    }
+    return;
+}
+
+bool CRS_Matrix::Compare2Old(int nnode, int const id[], int const ik[], double const sk[]) const
+{
+    bool bn = (nnode == _nrows);     // number of rows
+    if (!bn)
+    {
+        cout << "#########   Error: " << "number of rows" << endl;
+    }
+
+    bool bz = (id[nnode] == _nnz);   // number of non zero elements
+    if (!bz)
+    {
+        cout << "#########   Error: " << "number of non zero elements" << endl;
+    }
+
+    bool bd = equal(id, id + nnode + 1, _id.cbegin()); // row starts
+    if (!bd)
+    {
+        cout << "#########   Error: " << "row starts" << endl;
+    }
+
+    bool bk = equal(ik, ik + id[nnode], _ik.cbegin()); // column indices
+    if (!bk)
+    {
+        cout << "#########   Error: " << "column indices" << endl;
+    }
+
+    bool bv = equal(sk, sk + id[nnode], _sk.cbegin()); // values
+    if (!bv)
+    {
+        cout << "#########   Error: " << "values" << endl;
+    }
+
+    return bn && bz && bd && bk && bv;
+}
+
+
+void CRS_Matrix::Mult(vector<double> &w, vector<double> const &u) const
+{
+    assert( _nrows == static_cast<int>(w.size()) );
+    assert( w.size() == u.size() );
+
+    for (int row = 0; row < _nrows; ++row)
+    {
+        double wi = 0.0;
+        for (int ij = _id[row]; ij < _id[row + 1]; ++ij)
+        {
+            wi += _sk[ij] * u[ _ik[ij] ];
+        }
+        w[row] = wi;
+    }
+    return;
+}
+
+void CRS_Matrix::Defect(vector<double> &w,
+                        vector<double> const &f, vector<double> const &u) const
+{
+    assert( _nrows == static_cast<int>(w.size()) );
+    assert( w.size() == u.size() && u.size() == f.size() );
+
+    for (int row = 0; row < _nrows; ++row)
+    {
+        double wi = f[row];
+        for (int ij = _id[row]; ij < _id[row + 1]; ++ij)
+        {
+            wi -= _sk[ij] * u[ _ik[ij] ];
+        }
+        w[row] = wi;
+    }
+    return;
+}
+
+int CRS_Matrix::fetch(int const row, int const col) const
+{
+    int const id2 = _id[row + 1];    // end   and
+    int       ip  = _id[row];        // start of recent row (global index)
+
+    while (ip < id2 && _ik[ip] != col)  // find index col (global index)
+    {
+        ++ip;
+    }
+    if (ip >= id2)
+    {
+        ip = -1;
+#ifndef NDEBUG                 // compiler option -DNDEBUG switches off the check
+        cout << "No column  " << col << "  in row  " << row << endl;
+        assert(ip >= id2);
+#endif
+    }
+    return ip;
+}
+
+
+void CRS_Matrix::AddElem_3(int const ial[3], double const ske[3][3], double const fe[3], vector<double> &f)
+{
+    for (int i = 0; i < 3; ++i)
+    {
+        const int ii  = ial[i];           // row ii (global index)
+        for (int j = 0; j < 3; ++j)       // no symmetry assumed
+        {
+            const int jj = ial[j];        // column jj (global index)
+            int ip = fetch(ii, jj);       // find column entry jj in row ii
+#ifndef NDEBUG                 // compiler option -DNDEBUG switches off the check
+            if (ip < 0)        // no entry found !!
+            {
+                cout << "Error in AddElem: (" << ii << "," << jj << ") ["
+                     << ial[0] << "," << ial[1] << "," << ial[2] << "]\n";
+                assert(ip >= 0);
+            }
+#endif
+            _sk[ip] += ske[i][j];
+        }
+        f[ii] += fe[i];
+    }
+}
+
--- a/ex3_benchmarks/getmatrix.h
+++ b/ex3_benchmarks/getmatrix.h
@ -0,0 +1,178 @@
+#ifndef GETMATRIX_FILE
+#define GETMATRIX_FILE
+
+#include "geom.h"
+#include <vector>
+
+/**
+ * Calculates the element stiffness matrix @p ske and the element load vector @p fe
+ * of one triangular element with linear shape functions.
+ * @param[in]	ial	node indices of the three element vertices
+ * @param[in]	xc	vector of node coordinates with x(2*k,2*k+1) as coodinates of node k
+ * @param[out] ske	element stiffness matrix
+ * @param[out] fe	element load vector
+ */
+void CalcElem(int const ial[3], double const xc[], double ske[3][3], double fe[3]);
+
+/**
+ * Adds the element stiffness matrix @p ske and the element load vector @p fe
+ * of one triangular element with linear shape functions to the appropriate positions in
+ * the symmetric stiffness matrix, stored as CSR matrix K(@p sk,@p id, @p ik)
+ *
+ * @param[in] ial   node indices of the three element vertices
+ * @param[in] ske	element stiffness matrix
+ * @param[in] fe	element load vector
+ * @param[out] sk	vector non-zero entries of CSR matrix
+ * @param[in] id	index vector containing the first entry in a CSR row
+ * @param[in] ik	column index vector of CSR matrix
+ * @param[out] f	distributed local vector storing the right hand side
+ *
+ * @warning Algorithm requires indices in connectivity @p ial in ascending order.
+ *          Currently deprecated.
+*/
+void AddElem(int const ial[3], double const ske[3][3], double const fe[3],
+             int const id[], int const ik[], double sk[], double f[]);
+
+
+// #####################################################################
+/**
+ * Square matrix in CRS format (compressed row storage; also named CSR),
+ * see an <a href="https://en.wikipedia.org/wiki/Sparse_matrix">introduction</a>.
+ */
+class CRS_Matrix
+{
+    public:
+       /**
+        * Intializes the CRS matrix structure from the given discetization in @p mesh.
+        *
+        * The sparse matrix pattern is generated but the values are 0.
+        *
+        * @param[in] mesh given discretization
+        *
+        * @warning A reference to the discretization @p mesh is stored inside this class.
+        *          Therefore, changing @p mesh outside requires also
+        *          to call method @p Derive_Matrix_Pattern explicitely.
+        *
+        * @see Derive_Matrix_Pattern
+        */
+       explicit CRS_Matrix(Mesh const & mesh);
+
+      /**
+        * Destructor.
+        */
+       ~CRS_Matrix()
+       {}
+
+       /**
+        * Generates the sparse matrix pattern and overwrites the existing pattern.
+        *
+        * The sparse matrix pattern is generated but the values are 0.
+       */
+       void Derive_Matrix_Pattern();
+
+        /**
+        * Calculates the entries of f.e. stiffness matrix and load/rhs vector @p f for the Laplace operator in 2D.
+        * No memory is allocated.
+        *
+        * @param[in,out] f (preallocated) rhs/load vector
+        */
+       void CalculateLaplace(std::vector<double> &f);
+
+       /**
+        * Applies Dirichlet boundary conditions to stiffness matrix and to load vector @p f.
+        * The <a href="https://www.jstor.org/stable/2005611?seq=1#metadata_info_tab_contents">penalty method</a>
+        * is used for incorporating the given values @p u.
+        *
+        * @param[in]     u (global) vector with Dirichlet data
+        * @param[in,out] f load vector
+        */
+       void ApplyDirichletBC(std::vector<double> const &u, std::vector<double> &f);
+
+       /**
+        * Extracts the diagonal elemenst of the sparse matrix.
+        *
+        * @param[in,out]  d  (prellocated) vector of diagonal elements
+        */
+       void GetDiag(std::vector<double> &d) const;
+
+       /**
+        * Performs the matrix-vector product  w := K*u.
+        *
+        * @param[in,out] w resulting vector (preallocated)
+        * @param[in]     u vector
+        */
+       void Mult(std::vector<double> &w, std::vector<double> const &u) const;
+
+        /**
+        * Calculates the defect/residuum w := f - K*u.
+        *
+        * @param[in,out] w resulting vector (preallocated)
+        * @param[in]     f load vector
+        * @param[in]     u vector
+        */
+       void Defect(std::vector<double> &w,
+                   std::vector<double> const &f, std::vector<double> const &u) const;
+
+       /**
+		 * Number rows in matrix.
+		 * @return number of rows.
+		 */
+       int Nrows() const
+          {return _nrows;}
+
+       /**
+		 * Show the matrix entries.
+		 */
+       void Debug() const;
+
+       /**
+		 * Finds in a CRS matrix the access index for an entry at row @p row and column @p col.
+		 *
+		 * @param[in] row	row index
+		 * @param[in] col	column index
+		 * @return index for element (@p row, @p col). If no appropriate entry exists then -1 will be returned.
+		 *
+		 * @warning assert() stops the function in case that matrix element (@p row, @p col) doesn't exist.
+		*/
+       int fetch(int row, int col) const;
+
+      /**
+        * Adds the element stiffness matrix @p ske and the element load vector @p fe
+        * of one triangular element with linear shape functions to the appropriate positions in
+        * the stiffness matrix, stored as CSR matrix K(@p sk,@p id, @p ik).
+        *
+        * @param[in]     ial   node indices of the three element vertices
+        * @param[in]     ske   element stiffness matrix
+        * @param[in]     fe    element load vector
+        * @param[in,out] f	   distributed local vector storing the right hand side
+        *
+        * @warning Algorithm assumes  linear triangular elements (ndof_e==3).
+       */
+       void AddElem_3(int const ial[3], double const ske[3][3], double const fe[3], std::vector<double> &f);
+
+        /**
+        * Compare @p this CRS matrix with an external CRS matrix stored in C-Style.
+        *
+        * The method prints statements on differences found.
+        *
+        * @param[in]     nnode  row number of external matrix
+        * @param[in]     id     start indices of matrix rows of external matrix
+        * @param[in]     ik     column indices of external matrix
+        * @param[in]     sk     non-zero values of external matrix
+        *
+        * @return true iff all data are identical.
+        */
+       bool Compare2Old(int nnode, int const id[], int const ik[], double const sk[]) const;
+
+    private:
+       Mesh const & _mesh;      //!< reference to discretization
+       int _nrows;              //!< number of rows in matrix
+       int _nnz;                //!< number of non-zero entries
+       std::vector<int> _id;    //!< start indices of matrix rows
+       std::vector<int> _ik;    //!< column indices
+       std::vector<double> _sk; //!< non-zero values
+
+};
+
+
+#endif
--- a/ex3_benchmarks/gh_code_Schratter.pdf
+++ b/ex3_benchmarks/gh_code_Schratter.pdf
--- a/ex3_benchmarks/gh_response.txt
+++ b/ex3_benchmarks/gh_response.txt
@ -0,0 +1,13 @@
+* added ../GCC_default.mk and ../CLANG_default.mk  into your repository
+* missing objects in Makefile, see Makefile:18            [-1]
+* benchmark_tests.cpp:214   
+  double check_sum       // GH: initialize!
+  
+ * Some comprehensive documents on results would be good. 
+ 
+ * benchmarks.cpp
+   MatMat(): swao j and k loop    // should be faster
+   minor remarks in pdf.
+   
+   
+* make codecheck COMPILER=CLANG_ > codecheck_gh.out
--- a/ex3_benchmarks/jacsolve.cpp
+++ b/ex3_benchmarks/jacsolve.cpp
@ -0,0 +1,61 @@
+#include "vdop.h"
+#include "getmatrix.h"
+#include "jacsolve.h"
+
+#include <cassert>
+#include <cmath>
+#include <iostream>
+#include <vector>
+using namespace std;
+
+// #####################################################################
+// 	const int neigh[], const int color,
+// 	const MPI::Intracomm& icomm,
+void JacobiSolve(CRS_Matrix const &SK, vector<double> const &f, vector<double> &u)
+{
+    const double omega   = 1.0;
+    const int    maxiter = 1000;
+    const double tol  = 1e-5,                // tolerance
+                 tol2 = tol * tol;           // tolerance^2
+
+    int nrows = SK.Nrows();                  // number of rows == number of columns
+    assert( nrows == static_cast<int>(f.size()) && f.size() == u.size() );
+
+    cout << endl << " Start Jacobi solver for " << nrows << " d.o.f.s"  << endl;
+    //  Choose initial guess
+    for (int k = 0; k < nrows; ++k)
+    {
+        u[k] = 0.0;                          //  u := 0
+    }
+
+    vector<double> dd(nrows);                // matrix diagonal
+    vector<double>  r(nrows);                // residual
+    vector<double>  w(nrows);                // correction
+
+    SK.GetDiag(dd);                          //  dd := diag(K)
+    ////DebugVector(dd);{int ijk; cin >> ijk;}
+
+    //  Initial sweep
+    SK.Defect(r, f, u);                      //  r := f - K*u
+
+    vddiv(w, r, dd);                         //  w := D^{-1}*r
+    double sigma0 = dscapr(w, r);            // s0 := <w,r>
+
+    // Iteration sweeps
+    int iter  = 0;
+    double sigma = sigma0;
+    while ( sigma > tol2 * sigma0 && maxiter > iter)
+    {
+        ++iter;
+        vdaxpy(u, u, omega, w );             //  u := u + om*w
+        SK.Defect(r, f, u);                  //  r := f - K*u
+        vddiv(w, r, dd);                     //  w := D^{-1}*r
+        sigma = dscapr(w, r);                // s0 := <w,r>
+//      	cout << "Iteration " << iter << " : " << sqrt(sigma/sigma0) << endl;
+    }
+    cout << "aver. Jacobi rate :  " << exp(log(sqrt(sigma / sigma0)) / iter) << "  (" << iter << " iter)" << endl;
+    cout << "final error: " << sqrt(sigma / sigma0) << " (rel)   " << sqrt(sigma) << " (abs)\n";
+
+    return;
+}
+
--- a/ex3_benchmarks/jacsolve.h
+++ b/ex3_benchmarks/jacsolve.h
@ -0,0 +1,18 @@
+#ifndef JACSOLVE_FILE
+#define JACSOLVE_FILE
+#include "getmatrix.h"
+#include <vector>
+
+
+/**
+ * Solves linear system of equations  K @p u = @p f  via the Jacobi iteration.
+ * We use a distributed symmetric  CSR matrix @p SK and initial guess of the
+ * solution is set to 0.
+ * @param[in] SK	CSR matrix
+ * @param[in] f		distributed local vector storing the right hand side
+ * @param[out] u	accumulated local vector storing the solution.
+*/
+void JacobiSolve(CRS_Matrix const &SK, std::vector<double> const &f, std::vector<double> &u);
+
+
+#endif
--- a/ex3_benchmarks/main.cpp
+++ b/ex3_benchmarks/main.cpp
@ -0,0 +1,101 @@
+#include "benchmarks.h"
+#include "benchmark_tests.h"
+#include "factorization_solve_tests.h"
+#include <iostream>
+
+int main()
+{   
+    // ---------------------------------- 1. ----------------------------------
+    // results in file "test_system.txt"
+    
+    
+
+    // ---------------------------------- 2. ----------------------------------
+    //      Memory      FLOPS       Read-write operations
+    // (A)  2N          2N          2N
+    // (B)  MN + N      2NM         2NM + M
+    // (C)  ML + LM     2LNM        2LNM + MN
+    // (D)  (p+1) + N   3N(p+1)     N(2 + 3(p+1))
+
+
+
+    // ---------------------------------- 3. ----------------------------------
+    // implementation in file "benchmarks.cpp"
+
+
+
+    // ---------------------------------- 4.& 6. ----------------------------------
+    vector<vector<double>> results;
+
+    results.push_back(test_A(250, 50000000, scalar));       
+    results.push_back(test_A(250, 50000000, Kahan_skalar));
+    results.push_back(test_A(250, 50000000, scalar_cBLAS));
+    //                  Timing  GFLOPS  GiByte/s
+    // ------------------------------------------
+    // scalar           0.039   2.4     19
+    // Kahan_skalar     0.037   2.5     20
+    // scalar_cBLAS     0.032   2.9     23
+
+
+    results.push_back(test_B(100, 20000, 10000, MatVec));
+    results.push_back(test_B(100, 20000, 10000, MatVec_cBLAS));
+    //                  Timing  GFLOPS  GiByte/s
+    // ------------------------------------------
+    // MatVec           0.1     3.6     29
+    // MatVec_cBLAS     0.074   5       40
+
+
+    results.push_back(test_C(25, 500, 1000, 1500, MatMat));
+    results.push_back(test_C(25, 500, 1000, 1500, MatMat_cBLAS));
+    //                  Timing  GFLOPS  GiByte/s
+    // ------------------------------------------
+    // MatMat           0.57    2.5     20
+    // MatMat_cBLAS     0.019   75      6e+02   // unrealistic
+
+
+    results.push_back(test_D(100, 100, 1000000));
+    //                  Timing  GFLOPS  GiByte/s
+    // ------------------------------------------
+    //                  0.11    2.5     20
+    
+
+    cout << endl << "Timing\tGFLOPS\tGiByte/s" << endl;
+    cout << "------------------------------" << endl;
+    for (size_t i = 0; i < results.size(); ++i)
+        cout << results[i][0] << "\t" << results[i][1] << "\t" << results[i][2] << endl;
+    cout << endl;
+    
+
+    
+    // ---------------------------------- 5. ----------------------------------
+    // 5.(a) Observation: time to calculate norm is approximately half the time as for the scalar product.
+    // Reason: only have to access entries of x, so less memory that has to be accessed
+    //
+    // 5.(b) Runtime for Kahan_scalar is roughly the same as for the normal scalar product
+
+
+
+    // ---------------------------------- 6. ----------------------------------
+    // see 4.
+
+
+    // ---------------------------------- 7. ----------------------------------
+    CheckCorrectness();
+    // Checked correctness by computing the inverse of A
+    
+    CheckDuration(5000);
+    // The solving time per RHS scales roughly with factor 1/n_rhs
+
+
+
+    // ---------------------------------- 8. ----------------------------------
+    // done seperately
+    
+
+
+
+
+
+    return 0;
+    
+}
--- a/ex3_benchmarks/small_Doxyfile
+++ b/ex3_benchmarks/small_Doxyfile
--- a/ex3_benchmarks/test_system.txt
+++ b/ex3_benchmarks/test_system.txt
@ -0,0 +1,70 @@
+ ---------------------------------- 1. ----------------------------------
+
+
+rm -f *.exe *.o
+gcc -O3    -c -o mysecond.o mysecond.c
+gcc -O3  -c mysecond.c
+gfortran -O3 -DSTREAM_ARRAY_SIZE=80000000 -DNTIMES=20   -c stream.f
+gfortran -O3 stream.o mysecond.o -o stream_f.exe
+gcc -O3  -DSTREAM_ARRAY_SIZE=80000000 -DNTIMES=20  stream.c -o stream_c.exe
+gcc -O3  -DUNIX flops.c -o flops.exe
+flops.c: In function ‘main’:
+flops.c:231:4: warning: implicit declaration of function ‘dtime’ [-Wimplicit-function-declaration]
+  231 |    dtime(TimeArray);
+      |    ^~~~~
+flops.c: At top level:
+flops.c:723:1: warning: return type defaults to ‘int’ [-Wimplicit-int]
+  723 | dtime(p)
+      | ^~~~~
+./stream_c.exe
+-------------------------------------------------------------
+STREAM version $Revision: 5.10 $
+-------------------------------------------------------------
+This system uses 8 bytes per array element.
+-------------------------------------------------------------
+Array size = 80000000 (elements), Offset = 0 (elements)
+Memory per array = 610.4 MiB (= 0.6 GiB).
+Total memory required = 1831.1 MiB (= 1.8 GiB).
+Each kernel will be executed 20 times.
+ The *best* time for each kernel (excluding the first iteration)
+ will be used to compute the reported bandwidth.
+-------------------------------------------------------------
+Your clock granularity/precision appears to be 1 microseconds.
+Each test below will take on the order of 79294 microseconds.
+   (= 79294 clock ticks)
+Increase the size of the arrays if this shows that
+you are not getting at least 20 clock ticks per test.
+-------------------------------------------------------------
+WARNING -- The above is only a rough guideline.
+For best results, please be sure you know the
+precision of your system timer.
+-------------------------------------------------------------
+Function    Best Rate MB/s  Avg time     Min time     Max time
+Copy:           26720.6     0.057416     0.047903     0.098979
+Scale:          17008.6     0.087616     0.075256     0.133899
+Add:            19169.9     0.113818     0.100157     0.177676
+Triad:          19144.9     0.111248     0.100288     0.170877
+-------------------------------------------------------------
+Solution Validates: avg error less than 1.000000e-13 on all three arrays
+-------------------------------------------------------------
+./flops.exe
+
+   FLOPS C Program (Double Precision), V2.0 18 Dec 1992
+
+   Module     Error        RunTime      MFLOPS
+                            (usec)
+     1      4.0146e-13      0.0023   6183.8896
+     2     -1.4166e-13      0.0006  11377.1999
+     3      4.7184e-14      0.0034   5031.5222
+     4     -1.2557e-13      0.0031   4841.2566
+     5     -1.3800e-13      0.0056   5208.6586
+     6      3.2380e-13      0.0053   5484.2426
+     7     -8.4583e-11      0.0031   3832.8326
+     8      3.4867e-13      0.0055   5410.0375
+
+   Iterations      =  512000000
+   NullTime (usec) =     0.0000
+   MFLOPS(1)       =  8055.7366
+   MFLOPS(2)       =  4732.2658
+   MFLOPS(3)       =  5164.0037
+   MFLOPS(4)       =  5257.0181
--- a/ex3_benchmarks/userset.cpp
+++ b/ex3_benchmarks/userset.cpp
@ -0,0 +1,16 @@
+#include "userset.h"
+#include <cmath>
+
+
+double FunctF(double const x, double const y)
+{
+// return  std::sin(3.14159*1*x)*std::sin(3.14159*1*y);
+//  return 16.0*1024. ;
+// return (double)1.0 ;
+    return x * x * std::sin(2.5 * 3.14159 * y);
+}
+
+double FunctU(const double /* x */, double const /* y */)
+{
+    return 1.0 ;
+}
--- a/ex3_benchmarks/userset.h
+++ b/ex3_benchmarks/userset.h
@ -0,0 +1,44 @@
+#ifndef USERSET_FILE
+#define USERSET_FILE
+#include <cmath>
+/**
+ * User function: f(@p x,@p y)
+ * @param[in] x		x-coordinate of discretization point
+ * @param[in] y		y-coordinate of discretization point
+ * @return  value for right hand side f(@p x,@p y)
+ */
+double FunctF(double const x, double const y);
+
+/**
+ * User function: u(@p x,@p y)
+ * @param[in] x		x-coordinate of discretization point
+ * @param[in] y		y-coordinate of discretization point
+ * @return  value for solution vector u(@p x,@p y)
+ */
+double FunctU(double const x, double const y);
+
+
+/**
+ * User function: f(@p x,@p y) = @f$ x^2 \sin(2.5\pi y)@f$.
+ * @param[in] x		x-coordinate of discretization point
+ * @param[in] y		y-coordinate of discretization point
+ * @return  value f(@p x,@p y)
+ */
+inline double fNice(double const x, double const y)
+{
+    return x * x * std::sin(2.5 * 3.14159 * y);
+}
+
+/**
+ * User function: f(@p x,@p y) = 0$.
+ * @param[in] x		x-coordinate of discretization point
+ * @param[in] y		y-coordinate of discretization point
+ * @return  value 0
+ */
+inline double f_zero(double const x, double const y)
+//double f_zero(double const  /*x*/, double const /*y*/)
+{
+    return 0.0 + 0.0*(x+y);
+}
+
+#endif
--- a/ex3_benchmarks/vdop.cpp
+++ b/ex3_benchmarks/vdop.cpp
@ -0,0 +1,84 @@
+#include "vdop.h"
+#include <cassert>               // assert()
+#include <cmath>
+#include <iostream>
+#include <vector>
+using namespace std;
+
+
+void vddiv(vector<double> &x, vector<double> const &y,
+           vector<double> const &z)
+{
+    assert( x.size() == y.size() && y.size() == z.size() );
+    size_t n = x.size();
+
+    for (size_t k = 0; k < n; ++k)
+    {
+        x[k] = y[k] / z[k];
+    }
+    return;
+}
+
+//******************************************************************************
+
+void vdaxpy(std::vector<double> &x, std::vector<double> const &y,
+            double alpha, std::vector<double> const &z )
+{
+    assert( x.size() == y.size() && y.size() == z.size() );
+    size_t n = x.size();
+
+    for (size_t k = 0; k < n; ++k)
+    {
+        x[k] = y[k] + alpha * z[k];
+    }
+    return;
+}
+//******************************************************************************
+
+double dscapr(std::vector<double> const &x, std::vector<double> const &y)
+{
+    assert( x.size() == y.size());
+    size_t n = x.size();
+
+    double    s = 0.0;
+    for (size_t k = 0; k < n; ++k)
+    {
+        s += x[k] * y[k];
+    }
+
+    return s;
+}
+
+//******************************************************************************
+void DebugVector(vector<double> const &v)
+{
+    cout << "\nVector  (nnode = " << v.size() << ")\n";
+    for (size_t j = 0; j < v.size(); ++j)
+    {
+        cout.setf(ios::right, ios::adjustfield);
+        cout << v[j] << "   ";
+    }
+    cout << endl;
+
+    return;
+}
+//******************************************************************************
+bool CompareVectors(std::vector<double> const &x, int const n, double const y[], double const eps)
+{
+    bool bn = (static_cast<int>(x.size()) == n);
+    if (!bn)
+    {
+        cout << "#########   Error: " << "number of elements" << endl;
+    }
+    //bool bv = equal(x.cbegin(),x.cend(),y);
+    bool bv = equal(x.cbegin(), x.cend(), y,
+                    [eps](double a, double b) -> bool
+    { return std::abs(a - b) < eps * (1.0 + 0.5 * (std::abs(a) + std::abs(a))); }
+                   );
+    if (!bv)
+    {
+        assert(static_cast<int>(x.size()) == n);
+        cout << "#########   Error: " << "values" << endl;
+    }
+    return bn && bv;
+}
--- a/ex3_benchmarks/vdop.h
+++ b/ex3_benchmarks/vdop.h
@ -0,0 +1,58 @@
+#ifndef VDOP_FILE
+#define VDOP_FILE
+#include <vector>
+
+/** @brief  Element-wise vector divison x_k = y_k/z_k.
+ *
+ * @param[out] x  target vector
+ * @param[in]  y  source vector
+ * @param[in]  z  source vector
+ *
+ */
+void vddiv(std::vector<double> & x, std::vector<double> const& y,
+                                    std::vector<double> const& z);
+
+/** @brief  Element-wise daxpy operation x(k) = y(k) + alpha*z(k).
+ *
+ * @param[out] x  target vector
+ * @param[in]  y  source vector
+ * @param[in]  alpha  scalar
+ * @param[in]  z  source vector
+ *
+ */
+void vdaxpy(std::vector<double> & x, std::vector<double> const& y,
+                       double alpha, std::vector<double> const& z );
+
+
+/** @brief  Calculates the Euclidian inner product of two vectors.
+ *
+ * @param[in]  x vector
+ * @param[in]  y vector
+ * @return Euclidian inner product @f$\langle x,y \rangle@f$
+ *
+ */
+double dscapr(std::vector<double> const& x, std::vector<double> const& y);
+
+
+/**
+ * Print entries of a vector.
+ * @param[in] v	    vector values
+*/
+void DebugVector(std::vector<double> const &v);
+
+/** @brief  Compares an STL vector with POD vector.
+ *
+ * The accuracy criteria @f$ |x_k-y_k| < \varepsilon \left({1+0.5(|x_k|+|y_k|)}\right) @f$
+ * follows the book by
+ * <a href="https://www.springer.com/la/book/9783319446592">Stoyan/Baran</a>, p.8.
+ *
+ * @param[in]  x    STL vector
+ * @param[in]  n    length of POD vector
+ * @param[in]  y    POD vector
+ * @param[in]  eps  relative accuracy criteria (default := 0.0).
+ * @return true iff pairwise vector elements are relatively close to each other.
+ *
+ */
+bool CompareVectors(std::vector<double> const& x, int n, double const y[], double const eps=0.0);
+
+#endif
--- a/ex4/Code.pdf
+++ b/ex4/Code.pdf
--- a/ex4/ex_4A.py
+++ b/ex4/ex_4A.py
@ -0,0 +1,76 @@
+import numpy as np
+import matplotlib.pyplot as plt
+
+np.set_printoptions(precision=2)
+
+
+N = 10   # number of elements
+a = 1
+alpha = 1
+g_b = 1
+f_const = 1
+
+x = np.linspace(0, 1, N + 1)
+
+
+A = np.zeros((N + 1, N + 1))
+f_vec = np.zeros(N + 1)
+
+for i in range(1, N + 1):
+    h = x[i] - x[i - 1]
+
+
+    a_11 =  1./h + a*h/3.
+    a_12 = -1./h + a*h/6.
+    a_21 = -1./h + a*h/6.
+    a_22 =  1./h + a*h/3
+                                      
+    A[i - 1, i - 1] += a_11
+    A[i - 1, i] += a_12
+    A[i, i - 1] += a_21
+    A[i, i] += a_22
+
+
+    f_vec[i] = f_const*h
+
+print("A =\n", A)
+
+# take Neumann data into account
+A[N, N] += alpha
+f_vec[N] += alpha*g_b
+
+
+# take Dirichlet data into account
+u_g = np.zeros(N + 1)
+u_g[0] = 0
+print("u_g =\n", u_g)
+
+# remove first row of A
+A_g = A[1:N+1, :]
+#print("A_g =\n", A_g)
+
+# remove first row of f_vec
+f_g = f_vec[1:N+1]
+
+# assemble RHS with dirichlet data
+f_g -= A_g.dot(u_g)
+#print(f_g)
+
+# matrix for the inner nodes (excluding nodes with dirichlet bcs)
+A_0 = A[1:N+1, 1:N+1]   
+#print(A_0)
+
+# solve for u_0 (free dofs)
+u_0 = np.linalg.solve(A_0, f_g)
+
+# assemble "u = u_0 + u_g"
+u = np.concatenate([[0], u_0])
+
+
+print("u =\n", u)
+
+plt.plot(x, u, '-')
+plt.xlabel('x')
+plt.ylabel('u_h(x)')
+plt.grid()
+plt.show()
--- a/ex4/ex_4B.py
+++ b/ex4/ex_4B.py
@ -0,0 +1,83 @@
+import numpy as np
+import matplotlib.pyplot as plt
+
+np.set_printoptions(precision=2)
+
+
+
+N = 2   # number of elements
+split = np.sqrt(2)/2
+#split = 0.5
+
+
+N_lower = round(N*split)            # number of elements in (0, split)
+print(N_lower, "elements below sqrt(2)/2")
+N_upper = N - N_lower               # number of elements in (split, 1)
+print(N_upper, "elements above sqrt(2)/2")
+
+assert(N == N_lower + N_upper)
+
+
+
+x_lower = np.linspace(0, split, N_lower + 1)
+x_upper = np.linspace(split, 1, N_upper + 1)
+x = np.concatenate([x_lower, x_upper[1:]])
+print(x)
+
+
+A = np.zeros((N + 1, N + 1))
+
+for i in range(1, N + 1):
+    h = x[i] - x[i - 1]
+
+    lam = 1
+    if(x[i] > split):
+        lam = 10
+    
+
+    a_11 = lam/h
+    a_12 = -lam/h
+    a_21 = -lam/h
+    a_22 = lam/h
+                                      
+    A[i - 1, i - 1] += a_11
+    A[i - 1, i] += a_12
+    A[i, i - 1] += a_21
+    A[i, i] += a_22
+
+print("A =\n", A)
+
+
+
+# take dirichlet data into account
+u_g = np.zeros(N + 1)
+u_g[0] = 0
+u_g[N] = 1
+print("u_g =\n", u_g)
+
+# remove first and last row of A
+A_g = A[1:N, :]
+#print("A_g =\n", A_g)
+
+# assemble RHS with dirichlet data
+f = -A_g.dot(u_g)
+#print(f)
+
+# matrix for the inner nodes (excluding nodes with dirichlet bcs)
+A_0 = A[1:N, 1:N]   
+#print(A_0)
+
+# solve for u_0 (free dofs)
+u_0 = np.linalg.solve(A_0, f)
+
+# assemble "u = u_0 + u_g"
+u = np.concatenate([[0], u_0, [1]])
+
+
+print("u =\n", u)
+
+plt.plot(x, u, '-')
+plt.xlabel('x')
+plt.ylabel('u_h(x)')
+plt.grid()
+plt.show()
--- a/ex4/ex_4C.py
+++ b/ex4/ex_4C.py
@ -0,0 +1,76 @@
+import numpy as np
+import matplotlib.pyplot as plt
+
+np.set_printoptions(precision=2)
+
+
+p = 70
+N_vec = [10, 20, 30, 40, 70]
+for N in N_vec:
+
+    x = np.linspace(0, 1, N + 1)
+
+    A = np.zeros((N + 1, N + 1))
+
+    for i in range(1, N + 1):
+        h = x[i] - x[i - 1]
+
+
+        a_11 =  1./h - p/2.
+        a_12 = -1./h + p/2.
+        a_21 = -1./h - p/2.
+        a_22 =  1./h + p/2.
+                                            
+        A[i - 1, i - 1] += a_11
+        A[i - 1, i] += a_12
+        A[i, i - 1] += a_21
+        A[i, i] += a_22
+
+    print("A =\n", A)
+
+
+
+    # take dirichlet data into account
+    u_g = np.zeros(N + 1)
+    u_g[0] = 0
+    u_g[N] = 1
+    print("u_g =\n", u_g)
+
+    # remove first and last row of A
+    A_g = A[1:N, :]
+    #print("A_g =\n", A_g)
+
+    # assemble RHS with dirichlet data
+    f = -A_g.dot(u_g)
+    #print(f)
+
+    # matrix for the inner nodes (excluding nodes with dirichlet bcs)
+    A_0 = A[1:N, 1:N]   
+    #print(A_0)
+
+    # solve for u_0 (free dofs)
+    u_0 = np.linalg.solve(A_0, f)
+
+    # assemble "u = u_0 + u_g"
+    u = np.concatenate([[0], u_0, [1]])
+
+
+    print("u =\n", u)
+
+    plt.plot(x, u, '-')
+
+
+# plotting the exact solution
+plt.plot(x, (np.exp(p*x) - 1.)/(np.exp(p) - 1.))
+
+plt.xlabel('x')
+plt.ylabel('u_h(x)')
+plt.legend(N_vec + ['exact'])
+plt.title("Comparing discrete solution for increasing number of elements")
+plt.grid()
+plt.show()
+
+
+
+
+
--- a/ex4/ex_4_results.pdf
+++ b/ex4/ex_4_results.pdf
--- a/ex5/ex5_1/Makefile
+++ b/ex5/ex5_1/Makefile
@ -1,30 +0,0 @@
-#
-# use GNU-Compiler tools
-COMPILER=GCC_
-# alternatively from the shell
-# export COMPILER=GCC_
-# or, alternatively from the shell
-# make COMPILER=GCC_
-
-# use Intel compilers
-#COMPILER=ICC_
-
-# use PGI compilers
-# COMPILER=PGI_
-
-
-SOURCES = main.cpp mylib.cpp
-OBJECTS = $(SOURCES:.cpp=.o)
-
-PROGRAM	= main.${COMPILER}
-
-# uncomment the next to lines for debugging and detailed performance analysis
-CXXFLAGS += -g
-LINKFLAGS += -g
-# do not use -pg with PGI compilers
-
-ifndef COMPILER
-  COMPILER=GCC_
-endif
-
-include ../${COMPILER}default.mk
--- a/ex5/ex5_1/main.cpp
+++ b/ex5/ex5_1/main.cpp
@ -1,142 +0,0 @@
-#include "check_env.h"
-#include "mylib.h"
-#include <cstdlib>          // atoi()
-#include <cstring>          // strncmp()
-#include <ctime>
-#include <iostream>
-#include <omp.h>            // OpenMP
-#include <sstream>
-#include <string>
-using namespace std;
-
-int main(int argc, char const *argv[])
-{
-    omp_set_schedule(omp_sched_static, 2000000);
-    //omp_set_schedule(omp_sched_dynamic, 1000000);
-    //omp_set_schedule(omp_sched_guided, 1000000);
-    //omp_set_schedule(omp_sched_auto, 1); // chunk size does not matter for auto
-
-
-    // Speedup for different number of cores (incl. hyperthreading)
-    omp_set_num_threads(8);
-
-    // Print number of available processors
-    cout << "Number of available processors: " << omp_get_num_procs() << endl;
-
-    // Currently executing parallel code? -> no
-    cout << "Currently in parallel? " << omp_in_parallel() << endl;
-
-
-    int const NLOOPS = 10;        // chose a value such that the benchmark runs at least 10 sec.
-    unsigned int N = 500000001;
-
-
-//##########################################################################
-//   Read Parameter from command line  (C++ style)
-    cout << "Checking command line parameters for: -n <number> " << endl;
-    for (int i = 1; i < argc; i++)
-    {
-        cout << " arg[" << i << "] = " << argv[i] << endl;
-        string ss(argv[i]);
-        if ("-n"==ss && i + 1 < argc) // found "-n" followed by another parameter
-        {
-            N = static_cast<unsigned int>(atoi(argv[i + 1]));
-        }
-        else
-        {
-            cout << "Corect call: " << argv[0] << " -n  <number>\n";
-        }
-    }
-
-    cout << "\nN = " << N << endl;
-    
-    check_env(argc, argv);
-//########################################################################
-    int nthreads;                                  // OpenMP
-    #pragma omp parallel default(none) shared(cout,nthreads)
-    {
-        stringstream inparallel;
-        inparallel << "Currently in parallel? " << omp_in_parallel() << endl;
-
-
-        int const th_id  = omp_get_thread_num();   // OpenMP
-        int const nthrds = omp_get_num_threads();  // OpenMP
-        stringstream ss;
-        ss << "C++: Hello World from thread " << th_id << " / " << nthrds << endl;
-        #pragma omp critical
-        {
-            cout << ss.str();                      // output to a shared ressource
-            cout << inparallel.str() << endl;
-        }
-        #pragma omp master
-        nthreads = nthrds;                         // transfer nn to to master thread
-    }
-    cout << "   " << nthreads << "   threads have been started." << endl;
-
-//##########################################################################
-//  Memory allocation
-    cout << "Memory allocation\n";
-
-    vector<double> x(N), y(N);
-
-    cout.precision(2);
-    cout << 2.0 * N *sizeof(x[0]) / 1024 / 1024 / 1024 << " GByte Memory allocated\n";
-    cout.precision(6);
-
-//##########################################################################
-//  Data initialization
-//  Special:  x_i = i+1;  y_i = 1/x_i  ==> <x,y> == N
-    for (unsigned int i = 0; i < N; ++i)
-    {
-        x[i] = i + 1;
-        y[i] = 1.0 / x[i];
-    }
-
-//##########################################################################
-    cout << "\nStart Benchmarking\n";
-
-// Do calculation
-    double tstart = omp_get_wtime();                  // OpenMP
-
-    double sk(0.0);
-    for (int i = 0; i < NLOOPS; ++i)
-    {
-        //sk = scalar(x, y);
-        sk = scalar_parallel(x, y);
-        //sk = scalar_trans(x, y);
-        //sk = norm(x);
-    }
-
-    double t1 = omp_get_wtime() - tstart;             // OpenMP
-
-    t1 /= NLOOPS;           // divide by number of function calls
-
-//##########################################################################
-// Check the correct result
-    cout << "\n <x,y> = " << sk << endl;
-    if (static_cast<unsigned int>(sk) != N)
-    {
-        cout << "  !!   W R O N G  result   !!\n";
-    }
-    cout << endl;
-
-//##########################################################################
-// Timings  and Performance
-    cout << endl;
-    cout.precision(2);
-    cout << "Total benchmarking time: " << t1*NLOOPS << endl;
-    cout << "Timing in sec. : " << t1 << endl;
-    cout << "GFLOPS         : " << 2.0 * N / t1 / 1024 / 1024 / 1024 << endl;
-    cout << "GiByte/s        : " << 2.0 * N / t1 / 1024 / 1024 / 1024 * sizeof(x[0]) << endl;
-
-//#########################################################################
-
-    cout << "\n  Try the reduction with an STL-vektor \n";
-    
-    auto vr = reduction_vec_append(5);
-    cout << "done\n";
-    cout << vr << endl;
-
-
-    return 0;
-}  // memory for x and y will be deallocated their destructors
--- a/ex5/ex5_1/mylib.cpp
+++ b/ex5/ex5_1/mylib.cpp
@ -1,137 +0,0 @@
-#include "mylib.h"
-#include <cassert>       // assert()
-#include <cmath>
-#include <iostream>
-#include <functional>    // multiplies<>{}
-#include <list>
-#include <numeric>       // iota()
-#ifdef _OPENMP
-#include <omp.h>
-#endif
-#include <vector>
-using namespace std;
-
-
-double scalar_parallel(vector<double> const &x, vector<double> const &y)
-{
-    assert(x.size() == y.size()); // switch off via compile flag: -DNDEBUG
-    size_t const N = x.size();
-    double sum = 0.0;
-#pragma omp parallel default(none) shared(x,y,N, cout) reduction(+:sum)
-    {
-        const size_t nthreads = omp_get_num_threads();
-        const size_t threadnum = omp_get_thread_num();
-        const size_t chunksize = N/nthreads;
-
-
-        size_t start = threadnum*chunksize;
-        size_t end = start + chunksize;
-        if (threadnum == nthreads - 1)
-            end = N;
-
-
-        for (size_t i = start; i < end; ++i)
-        {
-            sum += x[i] * y[i];
-        }
-        
-    }
-    return sum;
-}
-
-
-vector<int> reduction_vec_append(int n)
-{ 
-    vector<int> vec(n);
-#pragma omp parallel default(none) shared(cout) reduction(VecAppend:vec)
-    {
-        #pragma omp barrier
-        #pragma omp critical
-        cout << omp_get_thread_num() << " : " << vec.size() << endl;
-        #pragma omp barrier
-        iota( vec.begin(),vec.end(), omp_get_thread_num() );
-        #pragma omp barrier
-
-    }
-    return vec;
-}
-
-
-
-
-
-
-
-
-
-
-
-double scalar(vector<double> const &x, vector<double> const &y)
-{
-    assert(x.size() == y.size()); // switch off via compile flag: -DNDEBUG
-    size_t const N = x.size();
-    double sum = 0.0;
-#pragma omp parallel for default(none) shared(x,y,N) reduction(+:sum) schedule(runtime) // added schedule(runtime)
-    for (size_t i = 0; i < N; ++i)
-    {
-        sum += x[i] * y[i];
-        //sum += exp(x[i])*log(y[i]);
-    }
-    return sum;
-}
-
-
-double norm(vector<double> const &x)
-{
-    size_t const N = x.size();
-    double sum = 0.0;
-#pragma omp parallel for default(none) shared(x,N) reduction(+:sum) schedule(runtime) // added schedule(runtime)
-    for (size_t i = 0; i < N; ++i)
-    {
-        sum += x[i]*x[i];
-    }
-    return sum;
-}
-
-
-vector<int> reduction_vec(int n)
-{ 
-    vector<int> vec(n);
-#pragma omp parallel default(none) shared(cout) reduction(VecAdd:vec)
-    {
-        #pragma omp barrier
-        #pragma omp critical
-        cout << omp_get_thread_num() << " : " << vec.size() << endl;
-        #pragma omp barrier
-        iota( vec.begin(),vec.end(), omp_get_thread_num() );
-        #pragma omp barrier
-
-    }
-    return vec;
-}
-
-
-double scalar_trans(vector<double> const &x, vector<double> const &y)
-{
-    assert(x.size() == y.size()); // switch off via compile flag: -DNDEBUG
-    vector<double> z(x.size());
-    //list<double> z(x.size());  // parallel for-loop on iterators not possible (missing 'operator-')
-                                 // c++-20 CLANG_, ONEAPI_:condition of OpenMP for loop must be a relational comparison 
-    
-    transform(cbegin(x),cend(x),cbegin(y),begin(z),std::multiplies<>{});
-    
-    double sum = 0.0;
-#pragma omp parallel for default(none) shared(z) reduction(+:sum) schedule(runtime) // added schedule(runtime)
-    for (auto pi = cbegin(z); pi!=cend(z); ++pi)
-    {
-        sum += *pi;
-    }
-    //for (auto val: z)
-    //{
-        //sum += val;
-    //}    
-    return sum;
-}
-
-
-
--- a/ex5/ex5_1/mylib.h
+++ b/ex5/ex5_1/mylib.h
@ -1,88 +0,0 @@
-#pragma once
-#include <cassert>
-#include <iomanip>             // setw()
-#include <iostream>
-#include <omp.h>
-#include <vector>
-
-/** 	Inner product
-	@param[in] x	vector
-	@param[in] y	vector
-	@return 	    resulting Euclidian inner product <x,y>
-*/
-double scalar_parallel(std::vector<double> const &x, std::vector<double> const &y);
-double scalar(std::vector<double> const &x, std::vector<double> const &y);
-double scalar_trans(std::vector<double> const &x, std::vector<double> const &y);
-
-
-
-// Declare additional reduction operation in OpenMP for STL-vector
-#pragma omp declare reduction(VecAppend : std::vector<int>  : omp_out.insert(omp_out.end(), omp_in.begin(), omp_in.end())) \
- initializer (omp_priv=omp_orig)
- 
-std::vector<int> reduction_vec_append(int n);
-
-
-/** 	l2-norm
-	@param[in] x	vector
-	@return 	    resulting Euclidian norm
-*/
-double norm(std::vector<double> const &x);
-
-/** 	 Vector @p b adds its elements to vector @p a .
-	@param[in] a	vector
-	@param[in] b	vector
-	@return 	     a+=b componentwise
-*/
-template<class T>
-std::vector<T> &operator+=(std::vector<T> &a, std::vector<T> const &b)
-{
-    assert(a.size()==b.size());
-    for (size_t k = 0; k < a.size(); ++k) {
-        a[k] += b[k];
-    }
-    return a;
-}
-
-// Declare the reduction operation in OpenMP for an STL-vector
-//   omp_out += omp_in  requires operator+=(vector<int> &, vector<int> const &) from above
-// ------------------------------------------------------------
-// https://scc.ustc.edu.cn/zlsc/tc4600/intel/2016.0.109/compiler_c/common/core/GUID-7312910C-D175-4544-99C5-29C12D980744.htm
-// https://gist.github.com/eruffaldi/7180bdec4c8c9a11f019dd0ba9a2d68c
-// https://stackoverflow.com/questions/29633531/user-defined-reduction-on-vector-of-varying-size
-//  see also p.74ff in  https://www.fz-juelich.de/ias/jsc/EN/AboutUs/Staff/Hagemeier_A/docs-parallel-programming/OpenMP-Slides.pdf
-#pragma omp declare reduction(VecAdd : std::vector<int>  : omp_out += omp_in) \
- initializer (omp_priv=omp_orig)
-
-//   Templates are  n o t  possible, i.e. the reduction has to be declared fore a specified type.
-//template <class T>
-//#pragma omp declare reduction(VecAdd : std::vector<T>  : omp_out += omp_in) initializer (omp_priv(omp_orig))
-// MS: template nach #pragma  !?
-
-// ------------------------------------------------------------
-
-/** 	Test for vector reduction.
- * 
- * The thread-private vectors of size @p n are initialized via @f$v_k^{tID}=tID+k@f$.
- * Afterwards these vectors are accumulated, i.e., 
- * @f$v_k= \sum_{tID=0}^{numThreads} v_k^{tID}@f$.
- * 
- * 	@param[in] n  size of global/private vector
- * 	@return  resulting global vector.
-*/
-std::vector<int> reduction_vec(int n);
-
-
-
-/** 	Output of a vector.
-	@param[in,out] s	output stream
-	@param[in]     x	vector
-	@return 	   modified output stream
-*/
-template <class T>
-std::ostream &operator<<(std::ostream &s, std::vector<T> const &x)
-{
-    for (auto const &v : x)  s << std::setw(4) << v << "  ";
-    return s;
-}
-
--- a/ex5/ex5_2/Makefile
+++ b/ex5/ex5_2/Makefile
@ -1,31 +0,0 @@
-#
-# use GNU-Compiler tools
-COMPILER=GCC_
-# alternatively from the shell
-# export COMPILER=GCC_
-# or, alternatively from the shell
-# make COMPILER=GCC_
-
-# use Intel compilers
-#COMPILER=ICC_
-
-# use PGI compilers
-# COMPILER=PGI_
-
-
-SOURCES = main.cpp mylib.cpp
-OBJECTS = $(SOURCES:.cpp=.o)
-
-PROGRAM	= main.${COMPILER}
-
-# uncomment the next to lines for debugging and detailed performance analysis
-CXXFLAGS += -g
-LINKFLAGS += -g
-# do not use -pg with PGI compilers
-
-ifndef COMPILER
-  COMPILER=GCC_
-endif
-
-
-include ../${COMPILER}default.mk
--- a/ex5/ex5_2/data_1.txt
+++ b/ex5/ex5_2/data_1.txt
@ -1,501 +0,0 @@
-141
-261
-87
-430
-258
-298
-425
-120
-496
-707
-244
-786
-75
-394
-4
-221
-2
-190
-143
-269
-175
-139
-599
-902
-940
-222
-483
-377
-524
-265
-69
-437
-174
-27
-955
-431
-962
-763
-8
-681
-706
-646
-553
-219
-773
-229
-371
-891
-857
-403
-319
-609
-911
-910
-592
-333
-854
-443
-905
-34
-533
-717
-180
-337
-188
-322
-404
-549
-49
-553
-275
-242
-244
-155
-957
-936
-819
-729
-176
-361
-189
-2
-317
-700
-626
-544
-440
-288
-502
-762
-763
-577
-748
-646
-124
-505
-348
-93
-148
-199
-673
-432
-695
-257
-10
-533
-280
-947
-907
-393
-25
-672
-838
-972
-57
-451
-583
-687
-720
-651
-727
-374
-582
-117
-58
-980
-285
-595
-963
-186
-194
-342
-933
-391
-274
-152
-398
-375
-132
-436
-92
-615
-11
-574
-790
-236
-449
-570
-62
-497
-643
-222
-838
-972
-847
-506
-279
-747
-237
-958
-621
-601
-173
-91
-256
-859
-912
-700
-726
-230
-577
-811
-404
-989
-90
-321
-512
-61
-726
-557
-530
-830
-859
-790
-318
-453
-753
-110
-110
-270
-525
-973
-711
-312
-292
-851
-912
-640
-256
-89
-839
-585
-949
-62
-585
-286
-828
-191
-443
-394
-827
-677
-208
-319
-134
-672
-571
-170
-148
-477
-909
-553
-33
-54
-806
-452
-383
-790
-365
-533
-712
-872
-329
-651
-975
-76
-588
-414
-310
-264
-759
-996
-187
-782
-196
-993
-803
-425
-729
-499
-809
-357
-74
-591
-911
-194
-433
-750
-40
-947
-764
-559
-184
-498
-518
-995
-855
-963
-679
-404
-935
-480
-232
-397
-706
-559
-757
-996
-963
-536
-964
-116
-52
-305
-581
-531
-902
-541
-432
-543
-713
-17
-801
-143
-479
-257
-370
-662
-170
-279
-199
-196
-327
-881
-472
-404
-180
-969
-408
-845
-616
-377
-878
-785
-465
-814
-899
-430
-335
-597
-902
-703
-378
-735
-955
-543
-541
-312
-72
-182
-93
-464
-10
-916
-643
-2
-31
-209
-455
-128
-9
-728
-355
-781
-437
-437
-50
-50
-92
-595
-242
-842
-858
-964
-489
-221
-227
-537
-763
-348
-462
-640
-918
-162
-716
-578
-434
-885
-394
-179
-634
-625
-328
-803
-1000
-981
-128
-233
-24
-608
-111
-408
-885
-549
-370
-209
-441
-957
-125
-471
-857
-44
-692
-979
-284
-134
-686
-910
-611
-900
-194
-755
-347
-419
-156
-820
-625
-739
-806
-68
-951
-498
-756
-743
-832
-157
-458
-619
-933
-836
-896
-583
-583
-855
-35
-886
-408
-37
-747
-155
-144
-606
-255
-325
-402
-407
-387
-610
-167
-189
-95
-324
-770
-235
-741
-693
-825
-828
-294
-310
-524
-326
-832
-811
-557
-263
-681
-234
-457
-385
-539
-992
-756
-981
-235
-529
-52
-757
-602
-858
-989
-930
-410
-1
-541
-208
-220
-326
-96
-748
-749
-544
-339
-833
-553
-958
-893
-357
-547
-347
-623
-797
-746
-126
-823
-26
-415
-732
-782
-368
-
--- a/ex5/ex5_2/main.GCC_
+++ b/ex5/ex5_2/main.GCC_
--- a/ex5/ex5_2/main.cpp
+++ b/ex5/ex5_2/main.cpp
@ -1,130 +0,0 @@
-#include "mylib.h"
-#include <fstream>
-#include <iostream>
-#include <omp.h>
-#include <vector>
-using namespace std;
-
-
-int main()
-{
-    // read vector from file
-    vector<size_t> data_vector = {};
-
-    ifstream input_stream("data_1.txt"); 
-
-    size_t line;
-    while(input_stream >> line)
-    {
-        data_vector.push_back(line);
-    }
-    data_vector.shrink_to_fit();
-
-
-
-
-    // specify loops
-    size_t NLOOPS = 10000;
-
-
-
-    // ############# Parallelization with openMP #############
-    // calculate arithmetic mean, geometric mean and harmonic mean
-    double am_omp, gm_omp, hm_omp;
-
-    double tstart = omp_get_wtime();
-
-    for (size_t i = 0; i < NLOOPS; ++i)
-        means_omp(data_vector, am_omp, gm_omp, hm_omp);
-
-    double t_means_omp = (omp_get_wtime() - tstart)/NLOOPS;
-
-    // calculate minimum and maximum
-    size_t min, max;
-
-    tstart = omp_get_wtime();
-
-    for (size_t i = 0; i < NLOOPS; ++i)
-        minmax_omp(data_vector, min, max);
-
-    double t_minmax_omp = (omp_get_wtime() - tstart)/NLOOPS;
-
-
-
-    
-
-
-    // ############# Parallelization with C++ algorithms #############
-    // calculate arithmetic mean, geometric mean and harmonic mean
-    double am_cpp, gm_cpp, hm_cpp;
-
-    tstart = omp_get_wtime();
-
-    for (size_t i = 0; i < NLOOPS; ++i)
-        means_cpp(data_vector, am_cpp, gm_cpp, hm_cpp);
-
-    double t_means_cpp = (omp_get_wtime() - tstart)/NLOOPS;
-
-    // calculate minimum and maximum
-    size_t min_cpp, max_cpp;
-
-    tstart = omp_get_wtime();
-
-    for (size_t i = 0; i < NLOOPS; ++i)
-        minmax_cpp(data_vector, min_cpp, max_cpp);
-
-    double t_minmax_cpp = (omp_get_wtime() - tstart)/NLOOPS;
-
-
-
-
-
-    // print results
-    cout << "####### OpenMP #######" << endl;
-    cout << "minimum: " << min << endl;
-    cout << "maximum: " << max << endl;
-    cout << "duration: " << t_minmax_omp << endl << endl;
-
-    cout << "arithmetic mean: " << am_omp << endl;
-    cout << "geometric mean: " << gm_omp << endl;
-    cout << "harmonic mean: " << hm_omp << endl;
-    cout << "duration: " << t_means_omp << endl << endl;
-
-
-    cout << "####### C++ #######" << endl;
-    cout << "minimum: " << min_cpp << endl;
-    cout << "maximum: " << max_cpp << endl;
-    cout << "duration: " << t_minmax_cpp << endl << endl;
-
-    cout << "arithmetic mean: " << am_cpp << endl;
-    cout << "geometric mean: " << gm_cpp << endl;
-    cout << "harmonic mean: " << hm_cpp << endl;
-    cout << "duration: " << t_means_cpp << endl << endl;
-
-
-
-    // ####### OpenMP #######
-    // minimum: 1
-    // maximum: 1000
-    // duration: 3.52086e-06
-
-    // arithmetic mean: 498.184
-    // geometric mean: 364.412
-    // harmonic mean: 95.6857
-    // duration: 5.90171e-06
-
-    // ####### C++ #######
-    // minimum: 1
-    // maximum: 1000
-    // duration: 1.76816e-05
-
-    // arithmetic mean: 498.184
-    // geometric mean: 364.412
-    // harmonic mean: 95.6857
-    // duration: 2.35728e-05
-
-    // --> the openMP variant is faster in both cases
-
-
-    return 0;
-}
--- a/ex5/ex5_2/main.o
+++ b/ex5/ex5_2/main.o
--- a/ex5/ex5_2/mylib.cpp
+++ b/ex5/ex5_2/mylib.cpp
@ -1,103 +0,0 @@
-#include "mylib.h"
-#include <algorithm>
-#include <cmath>
-#include <execution>
-#include <iostream>
-#include <numeric>
-#include <omp.h>
-
-using namespace std;
-
-
-void means_omp(const std::vector<size_t> numbers, double &am, double &gm, double &hm)
-{
-    size_t const n = numbers.size();
-
-    am = 0.;
-    gm = 0.;
-    hm = 0.;
-
-#pragma omp parallel for shared(numbers, n, cout) reduction(+:am, gm, hm)
-    for (size_t i = 0; i < n; ++i)
-    {
-        am += numbers[i];
-        gm += log(numbers[i]);
-        hm += 1.0/numbers[i];
-
-        // #pragma omp critical
-        // {
-        //     cout << "Thread number " << omp_get_thread_num() << " processes value " << numbers[i] << endl;
-        // }
-    }
-
-    am /= n;
-    gm = exp(gm/n);
-    hm = n/hm;
-}
-
-
-void minmax_omp(const std::vector<size_t> numbers, size_t &global_min, size_t &global_max)
-{
-    size_t const n = numbers.size();
-
-    global_min = -1; // gives the maximum size_t value
-    global_max = 0;
-
-#pragma omp parallel shared(numbers, n, global_min, global_max)
-    {
-        const size_t nthreads = omp_get_num_threads();
-        const size_t threadnum = omp_get_thread_num();
-        const size_t chunksize = n/nthreads;
-
-
-        size_t start = threadnum*chunksize;
-        size_t end = start + chunksize;
-        if (threadnum == nthreads - 1)
-            end = n;
-
-
-        size_t local_min = -1;
-        size_t local_max = 0;
-        for (size_t i = start; i < end ; ++i)
-        {
-            if (numbers[i] < local_min)
-                local_min = numbers[i];
-
-            if (numbers[i] > local_max)
-                local_max = numbers[i];
-        }
-
-        #pragma omp critical
-        {
-            if (local_min < global_min)
-                global_min = local_min;
-            
-            if (local_max > global_max)
-                global_max = local_max;
-        }
-    }
-}
-
-
-void means_cpp(const std::vector<size_t> numbers, double &am, double &gm, double &hm)
-{
-    size_t const n = numbers.size();
-
-    am = reduce(std::execution::par, numbers.begin(), numbers.end());
-    gm = transform_reduce(std::execution::par, numbers.begin(), numbers.end(), 0.0, plus{}, [] (size_t x) -> double { return log(x); } );
-    hm = transform_reduce(std::execution::par, numbers.begin(), numbers.end(), 0.0, plus{}, [] (size_t x) -> double { return 1.0/x; });
-
-    am /= n;
-    gm = exp(gm/n);
-    hm = n/hm;
-}
-
-
-void minmax_cpp(const std::vector<size_t> numbers, size_t &global_min, size_t &global_max)
-{
-    auto min_it = min_element(std::execution::par, numbers.begin(), numbers.end());
-    auto max_it = max_element(std::execution::par, numbers.begin(), numbers.end());
-    
-    global_min = *min_it;
-    global_max = *max_it;
-}
--- a/ex5/ex5_2/mylib.h
+++ b/ex5/ex5_2/mylib.h
@ -1,42 +0,0 @@
-#include <vector>
-
-/**
-  This function calculates arithmetic mean, geometric mean and harmonic mean of an integer vector.
-  Uses openMP parallelization.
-  @param[in]    numbers      vector containing integers
-  @param[out]   am           arithmetic mean
-  @param[out]   gm           geometric mean
-  @param[out]   hm           harmonic mean
-*/
-void means_omp(const std::vector<size_t> numbers, double &am, double &gm, double &hm);
-
-
-/**
-  This function calculates the minimum and maximum of a vector.
-  Uses openMP parallelization.
-  @param[in]    numbers         vector containing integers
-  @param[out]   global_min      minimum
-  @param[out]   global_max      maximum
-*/
-void minmax_omp(const std::vector<size_t> numbers, size_t &global_min, size_t &global_max);
-
-
-/**
-  This function calculates arithmetic mean, geometric mean and harmonic mean of an integer vector.
-  Uses C++ parallelization.
-  @param[in]    numbers      vector containing integers
-  @param[out]   am           arithmetic mean
-  @param[out]   gm           geometric mean
-  @param[out]   hm           harmonic mean
-*/
-void means_cpp(const std::vector<size_t> numbers, double &am, double &gm, double &hm);
-
-
-/**
-  This function calculates the minimum and maximum of a vector.
-  Uses C++ parallelization.
-  @param[in]    numbers         vector containing integers
-  @param[out]   global_min      minimum
-  @param[out]   global_max      maximum
-*/
-void minmax_cpp(const std::vector<size_t> numbers, size_t &global_min, size_t &global_max);
--- a/ex5/ex5_2/mylib.o
+++ b/ex5/ex5_2/mylib.o
--- a/ex5/ex5_3/Makefile
+++ b/ex5/ex5_3/Makefile
@ -1,30 +0,0 @@
-#
-# use GNU-Compiler tools
-COMPILER=GCC_
-# alternatively from the shell
-# export COMPILER=GCC_
-# or, alternatively from the shell
-# make COMPILER=GCC_
-
-# use Intel compilers
-#COMPILER=ICC_
-
-# use PGI compilers
-# COMPILER=PGI_
-
-
-SOURCES = main.cpp goldbach.cpp 
-OBJECTS = $(SOURCES:.cpp=.o)
-
-PROGRAM	= main.${COMPILER}
-
-# uncomment the next to lines for debugging and detailed performance analysis
-CXXFLAGS += -g
-LINKFLAGS += -g
-# do not use -pg with PGI compilers
-
-ifndef COMPILER
-  COMPILER=GCC_
-endif
-
-include ../${COMPILER}default.mk
--- a/ex5/ex5_3/goldbach.h
+++ b/ex5/ex5_3/goldbach.h
@ -1,45 +0,0 @@
-#pragma once
-#include "mayer_primes.h"
-#include <cassert>
-#include <vector>
-
-
-/**
-  This function returns the number of possible decompositions of an integer into a sum of two prime numbers.
-  @param[in]    k            first integer
-  @param[out]   count        number of decompositions
-*/
-size_t single_goldbach(size_t k);
-
-
-/**
-  This function returns the number of possible decompositions into a sum of two prime numbers of all even integers in the interval [4,n].
-  @param[in]    n                   upper integer bound
-  @param[out]   count_vector        vector containing the number of decompositions for a natural number the corresponding index
-*/
-std::vector<size_t> count_goldbach(size_t n);
-
-
-/** 	 Vector @p b adds its elements to vector @p a .
-	@param[in] a	vector
-	@param[in] b	vector
-	@return 	     a+=b componentwise
-*/
-template<class T>
-std::vector<T> &operator+=(std::vector<T> &a, std::vector<T> const &b)
-{
-    assert(a.size()==b.size());
-    for (size_t k = 0; k < a.size(); ++k) {
-        a[k] += b[k];
-    }
-    return a;
-}
-
-// Declare the reduction operation in OpenMP for an STL-vector
-//   omp_out += omp_in  requires operator+=(vector<int> &, vector<int> const &) from above
-// ------------------------------------------------------------
-// https://scc.ustc.edu.cn/zlsc/tc4600/intel/2016.0.109/compiler_c/common/core/GUID-7312910C-D175-4544-99C5-29C12D980744.htm
-// https://gist.github.com/eruffaldi/7180bdec4c8c9a11f019dd0ba9a2d68c
-// https://stackoverflow.com/questions/29633531/user-defined-reduction-on-vector-of-varying-size
-//  see also p.74ff in  https://www.fz-juelich.de/ias/jsc/EN/AboutUs/Staff/Hagemeier_A/docs-parallel-programming/OpenMP-Slides.pdf
-#pragma omp declare reduction(VecAdd : std::vector<size_t>  : omp_out += omp_in) initializer (omp_priv=omp_orig)
--- a/ex5/ex5_3/main.cpp
+++ b/ex5/ex5_3/main.cpp
@ -1,45 +0,0 @@
-#include "goldbach.h"
-#include <algorithm>
-#include <iostream>
-#include <omp.h>
-using namespace std;
-
-
-int main()
-{
-    cout << "Check: 694 has "<< single_goldbach(694) << " decompositions." << endl << "----------------------------------------" << endl;
-
-    for(size_t n : {10000, 100000, 400000, 1000000, 2000000})
-    {
-        double t_start = omp_get_wtime();
-
-        auto goldbach_vector = count_goldbach(n);
-
-        auto max_it = max_element(goldbach_vector.begin(), goldbach_vector.end());
-        size_t max_number = distance(goldbach_vector.begin(), max_it);
-
-        double t_end = omp_get_wtime() - t_start;
-
-        cout << "The number " <<  max_number << " has " << *max_it << " decompositions. Duration: " << t_end << endl;
-    }
-
-    /*
-    ###### WITHOUT PARALLELIZATION ######
-    The number 9240 has 329 decompositions. Duration: 0.00307696
-    The number 99330 has 2168 decompositions. Duration: 0.189839
-    The number 390390 has 7094 decompositions. Duration: 1.3042
-    The number 990990 has 15594 decompositions. Duration: 5.45034
-    The number 1981980 has 27988 decompositions. Duration: 47.1807
-
-
-    ###### WITH PARALLELIZATION ######
-    The number 9240 has 329 decompositions. Duration: 0.000734854
-    The number 99330 has 2168 decompositions. Duration: 0.0251322
-    The number 390390 has 7094 decompositions. Duration: 0.487375
-    The number 990990 has 15594 decompositions. Duration: 6.16972
-    The number 1981980 has 27988 decompositions. Duration: 31.5699
-    */
-
-
-    return 0;
-}
--- a/ex5/ex5_3/mayer_primes.h
+++ b/ex5/ex5_3/mayer_primes.h
@ -1,73 +0,0 @@
-#pragma once
-
-#include <cstring> //memset
-#include <vector>
-//using namespace std;
-
-/** \brief Determines all prime numbers in interval [2, @p max].
- *
- *  The sieve of Eratosthenes is used.
- *
- *  The implementation originates from <a href="http://code.activestate.com/recipes/576559-fast-prime-generator/">Florian Mayer</a>.
- *
- * \param[in]   max end of interval for the prime number search.
- * \return  vector of prime numbers @f$2,3,5, ..., p<=max @f$.
- *
- * \copyright
- * Copyright (c) 2008 Florian Mayer          (adapted by Gundolf Haase 2018)
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
- * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- */
-template <class T>
-std::vector<T> get_primes(T max)
-{
-    std::vector<T> primes;
-    char *sieve;
-    sieve = new char[max / 8 + 1];
-    // Fill sieve with 1
-    memset(sieve, 0xFF, (max / 8 + 1) * sizeof(char));
-    for (T x = 2; x <= max; x++)
-    {
-        if (sieve[x / 8] & (0x01 << (x % 8))) {
-            primes.push_back(x);
-            // Is prime. Mark multiplicates.
-            for (T j = 2 * x; j <= max; j += x)
-            {
-                sieve[j / 8] &= ~(0x01 << (j % 8));
-			}
-        }
-	}
-    delete[] sieve;
-    return primes;
-}
-
-//---------------------------------------------------------------
-//int main()        // by  Florian Mayer
-//{g++ -O3  -std=c++14 -fopenmp main.cpp && ./a.out
-//    vector<unsigned long> primes;
-//    primes = get_primes(10000000);
-//    // return 0;
-//    // Print out result.
-//    vector<unsigned long>::iterator it;
-//    for(it=primes.begin(); it < primes.end(); it++)
-//        cout << *it << " ";
-//
-//    cout << endl;
-//    return 0;
-//}
--- a/ex5/ex5_4/Makefile
+++ b/ex5/ex5_4/Makefile
@ -1,30 +0,0 @@
-#
-# use GNU-Compiler tools
-COMPILER=GCC_
-# alternatively from the shell
-# export COMPILER=GCC_
-# or, alternatively from the shell
-# make COMPILER=GCC_
-
-# use Intel compilers
-#COMPILER=ICC_
-
-# use PGI compilers
-# COMPILER=PGI_
-
-
-SOURCES = main.cpp benchmarks.cpp benchmark_tests.cpp
-OBJECTS = $(SOURCES:.cpp=.o)
-
-PROGRAM	= main.${COMPILER}
-
-# uncomment the next to lines for debugging and detailed performance analysis
-CXXFLAGS += -g
-LINKFLAGS += -g
-# do not use -pg with PGI compilers
-
-ifndef COMPILER
-  COMPILER=GCC_
-endif
-
-include ../${COMPILER}default.mk
--- a/ex5/ex5_4/benchmark_tests.cpp
+++ b/ex5/ex5_4/benchmark_tests.cpp
@ -1,375 +0,0 @@
-#include "benchmark_tests.h"
-#include "benchmarks.h"
-#include <chrono>
-#include <iostream>
-#include <math.h>       
-using namespace std::chrono; 
-
-vector<double> test_A(const size_t &NLOOPS, const size_t &N)
-{
-    cout << "#################### (A) ####################" << endl;
-    cout << "\nLOOPS = " << NLOOPS << endl;
-    cout << "\nN = " << N << endl;
-
-
-//  Memory allocation
-    cout << "Memory allocation\n";
-
-    vector<double> x(N), y(N);
-
-    cout.precision(2);
-    cout << 2.0*N *sizeof(x[0]) / 1024 / 1024 / 1024 << " GByte Memory allocated\n";
-    cout.precision(6);
-
-
-//  Data initialization
-//  Special:  x_i = i+1;  y_i = 1/x_i  ==> <x,y> == N
-
-    for (size_t i = 0; i < N; ++i)
-    {
-        x[i] = i % 219 + 1;
-        y[i] = 1.0/x[i];
-    }
-
-
-    cout << "\nStart Benchmarking scalar\n";
-
-    auto t1 = system_clock::now(); // start timer
-// Do calculation
-    double check(0.0),ss(0.0);
-    for (size_t i = 0; i < NLOOPS; ++i)
-    {
-        check = scalar_parallel(x, y);
-        ss += check;                   // prevents the optimizer from removing unused calculation results.
-    }
-
-    auto t2 = system_clock::now();  // stop timer
-    auto duration = duration_cast<microseconds>(t2 - t1);        // duration in microseconds
-    double t_diff = static_cast<double>(duration.count()) / 1e6; // overall duration in seconds
-    t_diff = t_diff/NLOOPS;                                      // duration per loop seconds
-    
-
-
-// Check the correct result
-    cout << "\n <x,y> = " << check << endl;
-    if (static_cast<unsigned int>(check) != N)
-        cout << "  !!   W R O N G  result   !!\n";
-    cout << endl;
-
-
-// Timings  and Performance
-    cout << endl;
-    cout.precision(2);
-
-
-    double Gflops = 2.0*N / t_diff / 1024 / 1024 / 1024;
-    double MemBandwidth = 2.0*N / t_diff / 1024 / 1024 / 1024 * sizeof(x[0]);
-
-    cout << "Total duration : " << t_diff*NLOOPS << endl;
-    cout << "Timing in sec. : " << t_diff << endl;
-    cout << "GFLOPS         : " << Gflops << endl;
-    cout << "GiByte/s       : " << MemBandwidth << endl;
-
-
-    return vector<double>{t_diff, Gflops, MemBandwidth};
-}
-
-vector<double> test_A_sum(const size_t &NLOOPS, const size_t &N)
-{
-    cout << "#################### (A) sum ####################" << endl;
-    cout << "\nLOOPS = " << NLOOPS << endl;
-    cout << "\nN = " << N << endl;
-
-
-//  Memory allocation
-    cout << "Memory allocation\n";
-
-    vector<double> x(N);
-
-    cout.precision(2);
-    cout << 1.0*N *sizeof(x[0]) / 1024 / 1024 / 1024 << " GByte Memory allocated\n";
-    cout.precision(6);
-
-
-//  Data initialization
-
-    for (size_t i = 0; i < N; ++i)
-    {
-        x[i] = 1;
-    }
-
-
-    cout << "\nStart Benchmarking sum\n";
-
-    auto t1 = system_clock::now(); // start timer
-// Do calculation
-    double check(0.0),ss(0.0);
-    for (size_t i = 0; i < NLOOPS; ++i)
-    {
-        check = sum(x);
-        ss += check;                   // prevents the optimizer from removing unused calculation results.
-    }
-
-    auto t2 = system_clock::now();  // stop timer
-    auto duration = duration_cast<microseconds>(t2 - t1);        // duration in microseconds
-    double t_diff = static_cast<double>(duration.count()) / 1e6; // overall duration in seconds
-    t_diff = t_diff/NLOOPS;                                      // duration per loop seconds
-    
-
-
-// Check the correct result
-    cout << "\n <x,y> = " << check << endl;
-    if (static_cast<unsigned int>(check) != N)
-        cout << "  !!   W R O N G  result   !!\n";
-    cout << endl;
-
-
-// Timings  and Performance
-    cout << endl;
-    cout.precision(2);
-
-
-    double Gflops = 1.0*N / t_diff / 1024 / 1024 / 1024;
-    double MemBandwidth = 1.0*N / t_diff / 1024 / 1024 / 1024 * sizeof(x[0]);
-
-    cout << "Total duration : " << t_diff*NLOOPS << endl;
-    cout << "Timing in sec. : " << t_diff << endl;
-    cout << "GFLOPS         : " << Gflops << endl;
-    cout << "GiByte/s       : " << MemBandwidth << endl;
-
-
-    return vector<double>{t_diff, Gflops, MemBandwidth};
-}
-
-
-vector<double> test_B(const size_t &NLOOPS, const size_t &N, const size_t &M)
-{
-    cout << "#################### (B) ####################" << endl;
-
-    cout << "\nLOOPS = " << NLOOPS << endl;
-    cout << "\nN = " << N << endl;
-    cout << "\nM = " << M << endl;
-
-//  Memory allocation
-    cout << "Memory allocation\n";
-
-    vector<double> A(M*N);
-    vector<double> x(N);
-
-    cout.precision(2);
-    cout << (1.0*M*N + N) * sizeof(x[0]) / 1024 / 1024 / 1024 << " GByte Memory allocated\n";
-    cout.precision(6);
-
-//  Data initialization
-
-    for (size_t i = 0; i < M; ++i)
-        for (size_t j = 0; j < N; ++j)
-            A[N*i + j] = (i + j) % 219 + 1;
-
-
-    for (size_t j = 0; j < N; ++j)
-    {
-        x[j] = 1.0/A[N*17 + j];
-    }
-
-    cout << "\nStart Benchmarking MatVec\n";
-
-    auto t1 = system_clock::now(); // start timer
-// Do calculation
-    vector<double> b(M);
-    
-    for (size_t i = 0; i < NLOOPS; ++i)
-    {
-        b = MatVec_parallel(A, x);
-    }
-
-    auto t2 = system_clock::now();  // stop timer
-    auto duration = duration_cast<microseconds>(t2 - t1);        // duration in microseconds
-    double t_diff = static_cast<double>(duration.count()) / 1e6; // overall duration in seconds
-    t_diff = t_diff/NLOOPS;                                      // duration per loop seconds
-
-
-// Check the correct result
-    cout << "\n <A[17,*],x> = " << b[17] << endl;
-    if (static_cast<size_t>(b[17]) != N)
-    {
-        cout << "  !!   W R O N G  result   !!\n";
-    }
-    cout << endl;
-
-
-// Timings  and Performance
-    cout << endl;
-    cout.precision(2);
-
-    double Gflops = (2.0*N*M) / t_diff / 1024 / 1024 / 1024;
-    double MemBandwidth = (2.0*N*M + M)/ t_diff / 1024 / 1024 / 1024 * sizeof(x[0]);
-
-    cout << "Total duration : " << t_diff*NLOOPS << endl;
-    cout << "Timing in sec. : " << t_diff << endl;
-    cout << "GFLOPS         : " << Gflops << endl;
-    cout << "GiByte/s       : " << MemBandwidth << endl;
-
-
-
-    return vector<double>{t_diff, Gflops, MemBandwidth};
-}
-
-
-vector<double> test_C(const size_t &NLOOPS, const size_t &L, const size_t &M, const size_t &N)
-{
-    cout << "#################### (C) ####################" << endl;
-    cout << "\nLOOPS = " << NLOOPS << endl;
-    cout << "\nL = " << L << endl;
-    cout << "\nM = " << M << endl;
-    cout << "\nN = " << N << endl;
-
-
-//  Memory allocation
-    cout << "Memory allocation\n";
-
-    vector<double> A(M*L);
-    vector<double> B(L*N);
-
-    cout.precision(2);
-    cout << (1.0*M*L + L*N) *sizeof(A[0]) / 1024 / 1024 / 1024 << " GByte Memory allocated\n";
-    cout.precision(6);
-
-
-//  Data initialization
-
-    for (size_t i = 0; i < M; ++i)
-        for (size_t k = 0; k < L; ++k)
-            A[L*i + k] = (i + k) % 219 + 1;
-
-    for (size_t k = 0; k < L; ++k)
-        for (size_t j = 0; j < N; ++j)
-            B[N*k + j] = 1.0/A[L*17 + k];
-
-
-    cout << "\nStart Benchmarking MatMat\n";
-
-    auto t1 = system_clock::now(); // start timer
-// Do calculation
-    vector<double> C(M*N);
-    double check;
-    double check_sum = 0;
-
-    for (size_t i = 0; i < NLOOPS; ++i)
-    {
-        C = MatMat_parallel(A, B, L);
-
-        check = C[N*17];
-        check_sum += check; // prevents the optimizer from removing unused calculation results.
-    }
-    cout << check_sum;
-
-    auto t2 = system_clock::now();  // stop timer
-    auto duration = duration_cast<microseconds>(t2 - t1);        // duration in microseconds
-    double t_diff = static_cast<double>(duration.count()) / 1e6; // overall duration in seconds
-    t_diff = t_diff/NLOOPS;                                      // duration per loop seconds
-
-
-// Check the correct result
-    cout << "\n C[17,0] = " << check << endl;
-    if (static_cast<unsigned int>(check) != L)
-    {
-        cout << "  !!   W R O N G  result   !!, should be " << L <<"\n";
-    }
-    cout << endl;
-
-// Timings  and Performance
-    cout << endl;
-    cout.precision(2);
-
-
-    double Gflops = (2.0*L*N*M) / t_diff / 1024 / 1024 / 1024;
-    double MemBandwidth = (2.0*L*N*M + M*N)/ t_diff / 1024 / 1024 / 1024 * sizeof(A[0]);
-
-    cout << "Total duration : " << t_diff*NLOOPS << endl;
-    cout << "Timing in sec. : " << t_diff << endl;
-    cout << "GFLOPS         : " << Gflops << endl;
-    cout << "GiByte/s       : " << MemBandwidth << endl;
-
-
-
-    return vector<double>{t_diff, Gflops, MemBandwidth};
-}
-
-
-vector<double> test_D(const size_t &NLOOPS, const size_t &N, const size_t &p)
-{
-    cout << "#################### (D) ####################" << endl;
-    cout << "\nLOOPS = " << NLOOPS << endl;
-    cout << "\nN = " << N << endl;
-    cout << "\np = " << p << endl;
-
-//  Memory allocation
-    cout << "Memory allocation\n";
-
-    vector<double> a(p + 1, 0);
-    vector<double> x(N);
-
-    cout.precision(2);
-    cout << (1.0*(p + 1) + N) *sizeof(x[0]) / 1024 / 1024 / 1024 << " GByte Memory allocated\n";
-    cout.precision(6);
-
-//  Data initialization
-
-    for (size_t j = 0; j < N; ++j)
-        x[j] = 1.0*j;
-    
-    for (size_t k = 0; k < p + 1; ++k)
-        a[k] = pow(-1.0, k);         // poly(x) = 1 - x + x^2 - x^3 + x^4 - ...
-
-
-
-    cout << "\nStart Benchmarking poly\n";
-
-    auto t1 = system_clock::now(); // start timer
-// Do calculation
-    vector<double> y(N);
-    double check;
-    double check_sum;
-    
-    for (size_t i = 0; i < NLOOPS; ++i)
-    {
-        y = poly_parallel(a, x);
-        check = y[0];
-
-        check_sum += check; // prevents the optimizer from removing unused calculation results.
-    }
-
-    auto t2 = system_clock::now();  // stop timer
-    auto duration = duration_cast<microseconds>(t2 - t1);        // duration in microseconds
-    double t_diff = static_cast<double>(duration.count()) / 1e6; // overall duration in seconds
-    t_diff = t_diff/NLOOPS;                                      // duration per loop seconds
-    
-
-
-// Check the correct result
-    cout << "\n poly(" << x[0] << ") = " << check << endl;
-    if (abs(check - 1.0) > 1.0/1e6)
-    {
-        cout << "  !!   W R O N G  result   !!\n";
-    }
-    cout << endl;
-
-
-// Timings  and Performance
-    cout << endl;
-    cout.precision(2);
-    
-
-    double Gflops = (N*(p + 1)*3.0) / t_diff / 1024 / 1024 / 1024;
-    double MemBandwidth = (N*(2.0 + 3.0*(p + 1)))/ t_diff / 1024 / 1024 / 1024 * sizeof(x[0]);
-
-    cout << "Total duration : " << t_diff*NLOOPS << endl;
-    cout << "Timing in sec. : " << t_diff << endl;
-    cout << "GFLOPS         : " << Gflops << endl;
-    cout << "GiByte/s       : " << MemBandwidth << endl;
-
-
-
-    return vector<double>{t_diff, Gflops, MemBandwidth};
-}
--- a/ex5/ex5_4/benchmark_tests.h
+++ b/ex5/ex5_4/benchmark_tests.h
@ -1,13 +0,0 @@
-#pragma once
-#include <vector>
-using namespace std;
-
-vector<double> test_A(const size_t &NLOOPS, const size_t &N);
-
-vector<double> test_A_sum(const size_t &NLOOPS, const size_t &N);
-
-vector<double> test_B(const size_t &NLOOPS, const size_t &N, const size_t &M);
-
-vector<double> test_C(const size_t &NLOOPS, const size_t &L, const size_t &M, const size_t &N);
-
-vector<double> test_D(const size_t &NLOOPS, const size_t &N, const size_t &p);
--- a/ex5/ex5_4/benchmarks.cpp
+++ b/ex5/ex5_4/benchmarks.cpp
@ -1,141 +0,0 @@
-#include "benchmarks.h"
-#include <cassert>       // assert()
-#include <cmath>
-#include <iostream>
-#include <vector>
-#include <omp.h>
-
-// (A) Inner product of two vectors (from skalar_stl)
-double scalar_parallel(vector<double> const &x, vector<double> const &y)
-{
-    assert(x.size() == y.size());
-    size_t const N = x.size();
-    double sum = 0.0;
-//#pragma omp parallel for default(none) shared(x, y, N) reduction(+:sum) schedule(runtime)
-#pragma omp parallel for shared(x, y, N) reduction(+:sum)
-    for (size_t i = 0; i < N; ++i)
-    {
-        sum += x[i] * y[i];
-    }
-    return sum;
-}
-
-// (A) Vector entry sum
-double sum(vector<double> const &x)
-{
-    double sum = 0.0;
-#pragma omp parallel for shared(x) reduction(+:sum)
-    for (size_t i = 0; i < x.size(); ++i)
-    {
-        sum += x[i];
-    }
-    return sum;
-}
-
-
-// (B) Matrix-vector product (from intro_vector_densematrix)
-vector<double> MatVec_parallel(vector<double> const &A, vector<double> const &x)
-{
-    size_t const nelem = A.size();
-    size_t const N = x.size();
-    assert(nelem % N == 0); // make sure multiplication is possible
-    size_t const M = nelem/N;          
-
-    vector<double> b(M);    
-
-#pragma omp parallel for shared(A, x, N, M, b)
-    for (size_t i = 0; i < M; ++i)
-    {
-        double tmp = 0.0;
-        for (size_t j = 0; j < N; ++j)
-            tmp += A[N*i + j] * x[j];
-        b[i] = tmp;
-    }
-
-    return b;
-}
-
-
-// (C) Matrix-matrix product
-vector<double> MatMat_parallel(vector<double> const &A, vector<double> const &B, size_t const &L)
-{
-    size_t const nelem_A = A.size();
-    size_t const nelem_B = B.size();
-
-    assert(nelem_A % L == 0 && nelem_B % L == 0);
-
-    size_t const M = nelem_A/L;
-    size_t const N = nelem_B/L;
-
-
-    vector<double> C(M*N);
-
-            
-#pragma omp parallel for shared(A, B, M, N, L, C)
-    for (size_t i = 0; i < M; ++i)
-    {
-        for (size_t k = 0; k < L; ++k)
-        {
-            for (size_t j = 0; j < N; ++j)
-            {
-                C[N*i + j] += A[L*i + k]*B[N*k + j];
-            }
-            
-        }
-    }
-
-    return C;
-}
-
-
-// (D) Evaluation of a polynomial function
-vector<double> poly_parallel(vector<double> const &a, vector<double> const &x)
-{
-    size_t const N = x.size();
-    size_t const p = a.size() - 1;
-    vector<double> y(N, 0);
-
-#pragma omp parallel for shared(a, x, N, p, y)
-    for (size_t i = 0; i < N; ++i)
-    {
-        double x_temp = x[i];
-        double y_temp = 0;
-        for (size_t k = 0; k < p + 1; ++k)
-        {
-            y_temp += x_temp*y_temp + a[p - k];
-        }
-        y[i] = y_temp;
-    }
-
-    return y;
-}
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
--- a/ex5/ex5_4/benchmarks.h
+++ b/ex5/ex5_4/benchmarks.h
@ -1,55 +0,0 @@
-#pragma once
-#include <vector>
-using namespace std;
-
-/**     (A) Inner product of two vectors (from skalar_stl)
-	@param[in] x	vector
-	@param[in] y	vector
-	@return 	    resulting Euclidian inner product <x,y>
-*/
-double scalar_parallel(vector<double> const &x, vector<double> const &y);
-
-
-/**     (A) Sum entries of vector
-	@param[in] x	vector
-	@return 	    sum
-*/
-double sum(vector<double> const &x);
-
-
-/** 	(B) Matrix-vector product (from intro_vector_densematrix)
- * 	@param[in] A	dense matrix (1D access)
- *  @param[in] u	vector
- *
- *	@return    resulting vector
-*/
-vector<double> MatVec_parallel(vector<double> const &A, vector<double> const &x);
-
-
-/** 	(C) Matrix-matrix product
- * 	@param[in] A			MxL dense matrix (1D access)
- *  @param[in] B			LxN dense matrix (1D access)
- *  @param[in] shared_dim 	shared dimension L
- *
- *	@return    resulting MxN matrix
-*/
-vector<double> MatMat_parallel(vector<double> const &A, vector<double> const &B, size_t const &shared_dim);
-
-
-/** 	(D) Evaluation of a polynomial function using Horner's scheme
- * 	@param[in] a	coefficient vector
- *  @param[in] x	vector with input values
- *
- *	@return    vector with output values
-*/
-vector<double> poly_parallel(vector<double> const &a, vector<double> const &x);
-
-
-
-
-
-
-
-
-
-
--- a/ex5/ex5_4/main.cpp
+++ b/ex5/ex5_4/main.cpp
@ -1,84 +0,0 @@
-#include "benchmark_tests.h"
-#include <iostream>
-#include <cmath>
-
-int main()
-{   
-    vector<vector<double>> results_scalar;
-    results_scalar.push_back(test_A(2000000, pow(10,3)));
-    results_scalar.push_back(test_A(1000000, pow(10,4)));
-    results_scalar.push_back(test_A(100000, pow(10,5)));
-    results_scalar.push_back(test_A(10000, pow(10,6)));
-    results_scalar.push_back(test_A(750, pow(10,7)));
-    results_scalar.push_back(test_A(125, pow(10,8)));
-
-
-    vector<vector<double>> results_sum;
-    results_sum.push_back(test_A_sum(3000000, pow(10,3)));
-    results_sum.push_back(test_A_sum(2000000, pow(10,4)));
-    results_sum.push_back(test_A_sum(1000000, pow(10,5)));
-    results_sum.push_back(test_A_sum(50000, pow(10,6)));
-    results_sum.push_back(test_A_sum(2000, pow(10,7)));
-    results_sum.push_back(test_A_sum(250, pow(10,8)));
-
-
-    test_B(100, 20000, 10000);
-
-    test_C(25, 500, 1000, 1500);
-
-    test_D(100, 100, 1000000);
-    
-
-
-    cout << endl << "###### Scalar ######" << endl;
-    cout << "Timing\tGFLOPS\tGiByte/s" << endl;
-    cout << "------------------------------" << endl;
-    for (size_t i = 0; i < results_scalar.size(); ++i)
-        cout << results_scalar[i][0] << "\t" << results_scalar[i][1] << "\t" << results_scalar[i][2] << endl;
-
-    cout << endl << "###### Sum ######" << endl;
-    cout << "Timing\tGFLOPS\tGiByte/s" << endl;
-    cout << "------------------------------" << endl;
-    for (size_t i = 0; i < results_sum.size(); ++i)
-        cout << results_sum[i][0] << "\t" << results_sum[i][1] << "\t" << results_sum[i][2] << endl;
-
-
-
-    
-    // ###### Scalar ######
-    // Timing  GFLOPS  GiByte/s
-    // ------------------------------
-    // 3.4e-06 0.54    4.3
-    // 4.6e-06 4       32
-    // 1.6e-05 12      95
-    // 0.0011  1.7     13
-    // 0.0097  1.9     15
-    // 0.075   2.5     20
-
-
-    // ###### Sum ######
-    // Timing  GFLOPS  GiByte/s
-    // ------------------------------
-    // 5.5e-06 0.17    1.3
-    // 5.4e-06 1.7     14
-    // 1.5e-05 6.1     49
-    // 0.00013 7.2     57
-    // 0.0033  2.8     23
-    // 0.032   2.9     23
-
-    
-
-    
-    
-    // ######### NOT PARALLEL (from exercise sheet 2) #########
-    //      Timing  GFLOPS  GiByte/s
-    // ----------------------------------
-    // (A)  0.038   2.5     20
-    // (B)  0.13    2.9     23
-    // (C)  0.44    3.2     25
-    // (D)  0.19    1.5     12
-
-
-
-    return 0;
-}
--- a/utils/Doxyfile
+++ b/utils/Doxyfile
--- a/Show more
+++ b/Show more