diff --git a/ex5/ex5_1/Makefile b/ex5/ex5_1/Makefile
new file mode 100644
index 0000000..f5bc097
--- /dev/null
+++ b/ex5/ex5_1/Makefile
@@ -0,0 +1,30 @@
+#
+# use GNU-Compiler tools
+COMPILER=GCC_
+# alternatively from the shell
+# export COMPILER=GCC_
+# or, alternatively from the shell
+# make COMPILER=GCC_
+
+# use Intel compilers
+#COMPILER=ICC_
+
+# use PGI compilers
+# COMPILER=PGI_
+
+
+SOURCES = main.cpp mylib.cpp
+OBJECTS = $(SOURCES:.cpp=.o)
+
+PROGRAM	= main.${COMPILER}
+
+# uncomment the next to lines for debugging and detailed performance analysis
+CXXFLAGS += -g
+LINKFLAGS += -g
+# do not use -pg with PGI compilers
+
+ifndef COMPILER
+  COMPILER=GCC_
+endif
+
+include ../${COMPILER}default.mk
diff --git a/ex5/ex5_1/check_env.h b/ex5/ex5_1/check_env.h
new file mode 100644
index 0000000..41bd99d
--- /dev/null
+++ b/ex5/ex5_1/check_env.h
@@ -0,0 +1,99 @@
+#pragma once
+
+#include <iostream>
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+#include <unordered_map>
+
+//#####################################
+// G.Haase
+// See https://sourceforge.net/p/predef/wiki/Compilers/
+//     http://www.cplusplus.com/doc/tutorial/preprocessor/
+//  also:  export OMP_DISPLAY_ENV=VERBOSE
+//#####################################
+/** 	Checks for compilers, its versions, threads etc.
+ * 
+	@param[in] argc	number of command line arguemnts
+	@param[in] argv	command line arguments as array of C-strings
+*/
+template <class T>
+void check_env(T argc, char const *argv[])
+{
+    std::cout << "\n#######################################################################\n";
+    std::cout << "Code    :";
+    for (T k = 0; k < argc; ++k) std::cout << "  " << argv[k];
+    std::cout << std::endl;
+
+    // compiler:      https://sourceforge.net/p/predef/wiki/Compilers/
+    std::cout <<    "Compiler:  ";
+#if defined __INTEL_COMPILER
+#pragma message(" ##########  INTEL  ###############")
+    std::cout << "INTEL " << __INTEL_COMPILER;
+    // Ignore warnings for #pragma acc   unrecognice
+#pragma warning disable 161
+    // Ignore warnings for #pragma omp   unrecognice
+#pragma warning disable 3180
+
+#elif defined __PGI
+#pragma message(" ##########  PGI    ###############")
+    std::cout << "PGI " << __PGIC__ << "." << __PGIC_MINOR__ << "." << __PGIC_PATCHLEVEL__;
+#elif defined  __clang__
+#pragma message(" ##########  CLANG    ###############")
+    std::cout << "CLANG " << __clang_major__ << "." << __clang_minor__ << "."; // << __clang_patchlevel__;
+#elif defined __GNUC__
+#pragma message(" ##########  Gnu    ###############")
+    std::cout << "Gnu " <<  __GNUC__  << "." << __GNUC_MINOR__ << "." << __GNUC_PATCHLEVEL__;
+#else
+#pragma message(" ##########  unknown Compiler   ###############")
+    std::cout << "unknown";
+#endif
+    std::cout << "  C++ standard: " << __cplusplus << std::endl;
+
+    // Parallel environments
+    std::cout <<    "Parallel:  ";
+#if defined MPI_VERSION
+#pragma message(" ##########  MPI    ###############")
+#ifdef OPEN_MPI
+    std::cout << "OpenMPI ";
+#else
+    std::cout << "MPI ";
+#endif
+    std::cout << MPI_VERSION << "." << MPI_SUBVERSION << "   ";
+#endif
+
+#ifdef _OPENMP
+//https://www.openmp.org/specifications/
+//https://stackoverflow.com/questions/1304363/how-to-check-the-version-of-openmp-on-linux
+    std::unordered_map<unsigned, std::string> const map{
+        {200505, "2.5"}, {200805, "3.0"}, {201107, "3.1"}, {201307, "4.0"}, {201511, "4.5"}, {201611, "5.0"}, {201811, "5.0"}};
+#pragma message(" ##########  OPENMP    ###############")
+    //std::cout << _OPENMP;
+    std::cout << "OpenMP ";
+    try {
+        std::cout << map.at(_OPENMP);
+    }
+    catch (...) {
+        std::cout << _OPENMP;
+    }
+    #pragma omp parallel
+    {
+        #pragma omp master
+        {
+            const int nn = omp_get_num_threads();          // OpenMP
+            std::cout << " ---> " <<  nn << " Threads   ";
+        }
+        #pragma omp barrier
+    }
+
+#endif
+#ifdef _OPENACC
+#pragma message(" ##########  OPENACC    ###############")
+    std::cout << "OpenACC   ";
+#endif
+    std::cout << std::endl;
+    std::cout << "Date    :  " << __DATE__ << "  " << __TIME__;
+    std::cout << "\n#######################################################################\n";
+}
+// HG
+
diff --git a/ex5/ex5_1/main.cpp b/ex5/ex5_1/main.cpp
new file mode 100644
index 0000000..c261c29
--- /dev/null
+++ b/ex5/ex5_1/main.cpp
@@ -0,0 +1,142 @@
+#include "check_env.h"
+#include "mylib.h"
+#include <cstdlib>          // atoi()
+#include <cstring>          // strncmp()
+#include <ctime>
+#include <iostream>
+#include <omp.h>            // OpenMP
+#include <sstream>
+#include <string>
+using namespace std;
+
+int main(int argc, char const *argv[])
+{
+    omp_set_schedule(omp_sched_static, 2000000);
+    //omp_set_schedule(omp_sched_dynamic, 1000000);
+    //omp_set_schedule(omp_sched_guided, 1000000);
+    //omp_set_schedule(omp_sched_auto, 1); // chunk size does not matter for auto
+
+
+    // Speedup for different number of cores (incl. hyperthreading)
+    omp_set_num_threads(8);
+
+    // Print number of available processors
+    cout << "Number of available processors: " << omp_get_num_procs() << endl;
+
+    // Currently executing parallel code? -> no
+    cout << "Currently in parallel? " << omp_in_parallel() << endl;
+
+
+    int const NLOOPS = 10;        // chose a value such that the benchmark runs at least 10 sec.
+    unsigned int N = 500000001;
+
+
+//##########################################################################
+//   Read Parameter from command line  (C++ style)
+    cout << "Checking command line parameters for: -n <number> " << endl;
+    for (int i = 1; i < argc; i++)
+    {
+        cout << " arg[" << i << "] = " << argv[i] << endl;
+        string ss(argv[i]);
+        if ("-n"==ss && i + 1 < argc) // found "-n" followed by another parameter
+        {
+            N = static_cast<unsigned int>(atoi(argv[i + 1]));
+        }
+        else
+        {
+            cout << "Corect call: " << argv[0] << " -n  <number>\n";
+        }
+    }
+
+    cout << "\nN = " << N << endl;
+    
+    check_env(argc, argv);
+//########################################################################
+    int nthreads;                                  // OpenMP
+    #pragma omp parallel default(none) shared(cout,nthreads)
+    {
+        stringstream inparallel;
+        inparallel << "Currently in parallel? " << omp_in_parallel() << endl;
+
+
+        int const th_id  = omp_get_thread_num();   // OpenMP
+        int const nthrds = omp_get_num_threads();  // OpenMP
+        stringstream ss;
+        ss << "C++: Hello World from thread " << th_id << " / " << nthrds << endl;
+        #pragma omp critical
+        {
+            cout << ss.str();                      // output to a shared ressource
+            cout << inparallel.str() << endl;
+        }
+        #pragma omp master
+        nthreads = nthrds;                         // transfer nn to to master thread
+    }
+    cout << "   " << nthreads << "   threads have been started." << endl;
+
+//##########################################################################
+//  Memory allocation
+    cout << "Memory allocation\n";
+
+    vector<double> x(N), y(N);
+
+    cout.precision(2);
+    cout << 2.0 * N *sizeof(x[0]) / 1024 / 1024 / 1024 << " GByte Memory allocated\n";
+    cout.precision(6);
+
+//##########################################################################
+//  Data initialization
+//  Special:  x_i = i+1;  y_i = 1/x_i  ==> <x,y> == N
+    for (unsigned int i = 0; i < N; ++i)
+    {
+        x[i] = i + 1;
+        y[i] = 1.0 / x[i];
+    }
+
+//##########################################################################
+    cout << "\nStart Benchmarking\n";
+
+// Do calculation
+    double tstart = omp_get_wtime();                  // OpenMP
+
+    double sk(0.0);
+    for (int i = 0; i < NLOOPS; ++i)
+    {
+        //sk = scalar(x, y);
+        sk = scalar_parallel(x, y);
+        //sk = scalar_trans(x, y);
+        //sk = norm(x);
+    }
+
+    double t1 = omp_get_wtime() - tstart;             // OpenMP
+
+    t1 /= NLOOPS;           // divide by number of function calls
+
+//##########################################################################
+// Check the correct result
+    cout << "\n <x,y> = " << sk << endl;
+    if (static_cast<unsigned int>(sk) != N)
+    {
+        cout << "  !!   W R O N G  result   !!\n";
+    }
+    cout << endl;
+
+//##########################################################################
+// Timings  and Performance
+    cout << endl;
+    cout.precision(2);
+    cout << "Total benchmarking time: " << t1*NLOOPS << endl;
+    cout << "Timing in sec. : " << t1 << endl;
+    cout << "GFLOPS         : " << 2.0 * N / t1 / 1024 / 1024 / 1024 << endl;
+    cout << "GiByte/s        : " << 2.0 * N / t1 / 1024 / 1024 / 1024 * sizeof(x[0]) << endl;
+
+//#########################################################################
+
+    cout << "\n  Try the reduction with an STL-vektor \n";
+    
+    auto vr = reduction_vec_append(5);
+    cout << "done\n";
+    cout << vr << endl;
+
+
+    return 0;
+}  // memory for x and y will be deallocated their destructors
diff --git a/ex5/ex5_1/mylib.cpp b/ex5/ex5_1/mylib.cpp
new file mode 100644
index 0000000..b5f2697
--- /dev/null
+++ b/ex5/ex5_1/mylib.cpp
@@ -0,0 +1,137 @@
+#include "mylib.h"
+#include <cassert>       // assert()
+#include <cmath>
+#include <iostream>
+#include <functional>    // multiplies<>{}
+#include <list>
+#include <numeric>       // iota()
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+#include <vector>
+using namespace std;
+
+
+double scalar_parallel(vector<double> const &x, vector<double> const &y)
+{
+    assert(x.size() == y.size()); // switch off via compile flag: -DNDEBUG
+    size_t const N = x.size();
+    double sum = 0.0;
+#pragma omp parallel default(none) shared(x,y,N, cout) reduction(+:sum)
+    {
+        const size_t nthreads = omp_get_num_threads();
+        const size_t threadnum = omp_get_thread_num();
+        const size_t chunksize = N/nthreads;
+
+
+        size_t start = threadnum*chunksize;
+        size_t end = start + chunksize;
+        if (threadnum == nthreads - 1)
+            end = N;
+
+
+        for (size_t i = start; i < end; ++i)
+        {
+            sum += x[i] * y[i];
+        }
+        
+    }
+    return sum;
+}
+
+
+vector<int> reduction_vec_append(int n)
+{ 
+    vector<int> vec(n);
+#pragma omp parallel default(none) shared(cout) reduction(VecAppend:vec)
+    {
+        #pragma omp barrier
+        #pragma omp critical
+        cout << omp_get_thread_num() << " : " << vec.size() << endl;
+        #pragma omp barrier
+        iota( vec.begin(),vec.end(), omp_get_thread_num() );
+        #pragma omp barrier
+
+    }
+    return vec;
+}
+
+
+
+
+
+
+
+
+
+
+
+double scalar(vector<double> const &x, vector<double> const &y)
+{
+    assert(x.size() == y.size()); // switch off via compile flag: -DNDEBUG
+    size_t const N = x.size();
+    double sum = 0.0;
+#pragma omp parallel for default(none) shared(x,y,N) reduction(+:sum) schedule(runtime) // added schedule(runtime)
+    for (size_t i = 0; i < N; ++i)
+    {
+        sum += x[i] * y[i];
+        //sum += exp(x[i])*log(y[i]);
+    }
+    return sum;
+}
+
+
+double norm(vector<double> const &x)
+{
+    size_t const N = x.size();
+    double sum = 0.0;
+#pragma omp parallel for default(none) shared(x,N) reduction(+:sum) schedule(runtime) // added schedule(runtime)
+    for (size_t i = 0; i < N; ++i)
+    {
+        sum += x[i]*x[i];
+    }
+    return sum;
+}
+
+
+vector<int> reduction_vec(int n)
+{ 
+    vector<int> vec(n);
+#pragma omp parallel default(none) shared(cout) reduction(VecAdd:vec)
+    {
+        #pragma omp barrier
+        #pragma omp critical
+        cout << omp_get_thread_num() << " : " << vec.size() << endl;
+        #pragma omp barrier
+        iota( vec.begin(),vec.end(), omp_get_thread_num() );
+        #pragma omp barrier
+
+    }
+    return vec;
+}
+
+
+double scalar_trans(vector<double> const &x, vector<double> const &y)
+{
+    assert(x.size() == y.size()); // switch off via compile flag: -DNDEBUG
+    vector<double> z(x.size());
+    //list<double> z(x.size());  // parallel for-loop on iterators not possible (missing 'operator-')
+                                 // c++-20 CLANG_, ONEAPI_:condition of OpenMP for loop must be a relational comparison 
+    
+    transform(cbegin(x),cend(x),cbegin(y),begin(z),std::multiplies<>{});
+    
+    double sum = 0.0;
+#pragma omp parallel for default(none) shared(z) reduction(+:sum) schedule(runtime) // added schedule(runtime)
+    for (auto pi = cbegin(z); pi!=cend(z); ++pi)
+    {
+        sum += *pi;
+    }
+    //for (auto val: z)
+    //{
+        //sum += val;
+    //}    
+    return sum;
+}
+
+
+
diff --git a/ex5/ex5_1/mylib.h b/ex5/ex5_1/mylib.h
new file mode 100644
index 0000000..fe1948a
--- /dev/null
+++ b/ex5/ex5_1/mylib.h
@@ -0,0 +1,88 @@
+#pragma once
+#include <cassert>
+#include <iomanip>             // setw()
+#include <iostream>
+#include <omp.h>
+#include <vector>
+
+/** 	Inner product
+	@param[in] x	vector
+	@param[in] y	vector
+	@return 	    resulting Euclidian inner product <x,y>
+*/
+double scalar_parallel(std::vector<double> const &x, std::vector<double> const &y);
+double scalar(std::vector<double> const &x, std::vector<double> const &y);
+double scalar_trans(std::vector<double> const &x, std::vector<double> const &y);
+
+
+
+// Declare additional reduction operation in OpenMP for STL-vector
+#pragma omp declare reduction(VecAppend : std::vector<int>  : omp_out.insert(omp_out.end(), omp_in.begin(), omp_in.end())) \
+ initializer (omp_priv=omp_orig)
+ 
+std::vector<int> reduction_vec_append(int n);
+
+
+/** 	l2-norm
+	@param[in] x	vector
+	@return 	    resulting Euclidian norm
+*/
+double norm(std::vector<double> const &x);
+
+/** 	 Vector @p b adds its elements to vector @p a .
+	@param[in] a	vector
+	@param[in] b	vector
+	@return 	     a+=b componentwise
+*/
+template<class T>
+std::vector<T> &operator+=(std::vector<T> &a, std::vector<T> const &b)
+{
+    assert(a.size()==b.size());
+    for (size_t k = 0; k < a.size(); ++k) {
+        a[k] += b[k];
+    }
+    return a;
+}
+
+// Declare the reduction operation in OpenMP for an STL-vector
+//   omp_out += omp_in  requires operator+=(vector<int> &, vector<int> const &) from above
+// ------------------------------------------------------------
+// https://scc.ustc.edu.cn/zlsc/tc4600/intel/2016.0.109/compiler_c/common/core/GUID-7312910C-D175-4544-99C5-29C12D980744.htm
+// https://gist.github.com/eruffaldi/7180bdec4c8c9a11f019dd0ba9a2d68c
+// https://stackoverflow.com/questions/29633531/user-defined-reduction-on-vector-of-varying-size
+//  see also p.74ff in  https://www.fz-juelich.de/ias/jsc/EN/AboutUs/Staff/Hagemeier_A/docs-parallel-programming/OpenMP-Slides.pdf
+#pragma omp declare reduction(VecAdd : std::vector<int>  : omp_out += omp_in) \
+ initializer (omp_priv=omp_orig)
+
+//   Templates are  n o t  possible, i.e. the reduction has to be declared fore a specified type.
+//template <class T>
+//#pragma omp declare reduction(VecAdd : std::vector<T>  : omp_out += omp_in) initializer (omp_priv(omp_orig))
+// MS: template nach #pragma  !?
+
+// ------------------------------------------------------------
+
+/** 	Test for vector reduction.
+ * 
+ * The thread-private vectors of size @p n are initialized via @f$v_k^{tID}=tID+k@f$.
+ * Afterwards these vectors are accumulated, i.e., 
+ * @f$v_k= \sum_{tID=0}^{numThreads} v_k^{tID}@f$.
+ * 
+ * 	@param[in] n  size of global/private vector
+ * 	@return  resulting global vector.
+*/
+std::vector<int> reduction_vec(int n);
+
+
+
+/** 	Output of a vector.
+	@param[in,out] s	output stream
+	@param[in]     x	vector
+	@return 	   modified output stream
+*/
+template <class T>
+std::ostream &operator<<(std::ostream &s, std::vector<T> const &x)
+{
+    for (auto const &v : x)  s << std::setw(4) << v << "  ";
+    return s;
+}
+
diff --git a/ex5/ex5_1/timing.h b/ex5/ex5_1/timing.h
new file mode 100644
index 0000000..11d8da2
--- /dev/null
+++ b/ex5/ex5_1/timing.h
@@ -0,0 +1,70 @@
+#pragma once
+#include <chrono>                  // timing
+#include <stack>
+
+using Clock = std::chrono::system_clock;   //!< The wall clock timer chosen
+//using Clock = std::chrono::high_resolution_clock;
+using TPoint= std::chrono::time_point<Clock>;
+
+// [Galowicz, C++17 STL Cookbook, p. 29]
+inline 
+std::stack<TPoint> MyStopWatch; //!< starting time of stopwatch
+
+/** Starts stopwatch timer.
+ *  Use as @code tic(); myfunction(...) ; double tsec = toc();  @endcode
+ * 
+ *  The timining is allowed to be nested and the recent time is stored on top of the stack.
+ * 
+ *  @return recent time
+ *  @see toc
+ */
+inline auto tic()
+{
+    MyStopWatch.push(Clock::now());
+    return MyStopWatch.top();
+}
+ 
+/** Returns the elapsed time from stopwatch. 
+ * 
+ * The time from top of the stack is used
+ * if time point @p t_b is not passed as input parameter.
+ * Use as @code tic(); myfunction(...) ; double tsec = toc();  @endcode
+ * or as @code auto t_b = tic(); myfunction(...) ; double tsec = toc(t_b);  @endcode
+ * The last option is to be used in the case of 
+ * non-nested but overlapping time measurements.
+ * 
+ * @param[in]  t_b start time of some stop watch
+ * @return elapsed time in seconds.
+ *
+*/ 
+inline double toc(TPoint const &t_b = MyStopWatch.top())
+{
+    // https://en.cppreference.com/w/cpp/chrono/treat_as_floating_point
+    using Unit      = std::chrono::seconds;
+    using FpSeconds = std::chrono::duration<double, Unit::period>;        
+    auto t_e = Clock::now();
+    MyStopWatch.pop();
+    return FpSeconds(t_e-t_b).count();
+}
+
+#include <iostream>
+#include <string>
+/** Executes function @p f and measures/prints elapsed wall clock time in seconds
+ * 
+ * Call as
+ * @code measure("Time for (b = b + 1)", [&]() {
+        thrust::transform(b.begin(), b.end(), b.begin(), increment());
+    }); @endcode
+ * 
+ * @param[in]  label  additional string to be printed with the measurement.
+ * @param[in]  f      function to execute.
+ * @author Therese Bösmüller, 2025
+ *
+*/ 
+auto measure = [](const std::string& label, auto&& f) {
+        auto start    = std::chrono::high_resolution_clock::now();
+        f();
+        auto stop     = std::chrono::high_resolution_clock::now();
+        auto duration = std::chrono::duration_cast<std::chrono::microseconds>(stop - start).count();
+        std::cout << label << ": " << duration << " microseconds" << std::endl;
+};          // ';' is needed for a visible documentation of this lambda-function
diff --git a/ex5/ex5_2/Makefile b/ex5/ex5_2/Makefile
new file mode 100644
index 0000000..7363b62
--- /dev/null
+++ b/ex5/ex5_2/Makefile
@@ -0,0 +1,31 @@
+#
+# use GNU-Compiler tools
+COMPILER=GCC_
+# alternatively from the shell
+# export COMPILER=GCC_
+# or, alternatively from the shell
+# make COMPILER=GCC_
+
+# use Intel compilers
+#COMPILER=ICC_
+
+# use PGI compilers
+# COMPILER=PGI_
+
+
+SOURCES = main.cpp mylib.cpp
+OBJECTS = $(SOURCES:.cpp=.o)
+
+PROGRAM	= main.${COMPILER}
+
+# uncomment the next to lines for debugging and detailed performance analysis
+CXXFLAGS += -g
+LINKFLAGS += -g
+# do not use -pg with PGI compilers
+
+ifndef COMPILER
+  COMPILER=GCC_
+endif
+
+
+include ../${COMPILER}default.mk
diff --git a/ex5/ex5_2/data_1.txt b/ex5/ex5_2/data_1.txt
new file mode 100644
index 0000000..0cc4bb8
--- /dev/null
+++ b/ex5/ex5_2/data_1.txt
@@ -0,0 +1,501 @@
+141
+261
+87
+430
+258
+298
+425
+120
+496
+707
+244
+786
+75
+394
+4
+221
+2
+190
+143
+269
+175
+139
+599
+902
+940
+222
+483
+377
+524
+265
+69
+437
+174
+27
+955
+431
+962
+763
+8
+681
+706
+646
+553
+219
+773
+229
+371
+891
+857
+403
+319
+609
+911
+910
+592
+333
+854
+443
+905
+34
+533
+717
+180
+337
+188
+322
+404
+549
+49
+553
+275
+242
+244
+155
+957
+936
+819
+729
+176
+361
+189
+2
+317
+700
+626
+544
+440
+288
+502
+762
+763
+577
+748
+646
+124
+505
+348
+93
+148
+199
+673
+432
+695
+257
+10
+533
+280
+947
+907
+393
+25
+672
+838
+972
+57
+451
+583
+687
+720
+651
+727
+374
+582
+117
+58
+980
+285
+595
+963
+186
+194
+342
+933
+391
+274
+152
+398
+375
+132
+436
+92
+615
+11
+574
+790
+236
+449
+570
+62
+497
+643
+222
+838
+972
+847
+506
+279
+747
+237
+958
+621
+601
+173
+91
+256
+859
+912
+700
+726
+230
+577
+811
+404
+989
+90
+321
+512
+61
+726
+557
+530
+830
+859
+790
+318
+453
+753
+110
+110
+270
+525
+973
+711
+312
+292
+851
+912
+640
+256
+89
+839
+585
+949
+62
+585
+286
+828
+191
+443
+394
+827
+677
+208
+319
+134
+672
+571
+170
+148
+477
+909
+553
+33
+54
+806
+452
+383
+790
+365
+533
+712
+872
+329
+651
+975
+76
+588
+414
+310
+264
+759
+996
+187
+782
+196
+993
+803
+425
+729
+499
+809
+357
+74
+591
+911
+194
+433
+750
+40
+947
+764
+559
+184
+498
+518
+995
+855
+963
+679
+404
+935
+480
+232
+397
+706
+559
+757
+996
+963
+536
+964
+116
+52
+305
+581
+531
+902
+541
+432
+543
+713
+17
+801
+143
+479
+257
+370
+662
+170
+279
+199
+196
+327
+881
+472
+404
+180
+969
+408
+845
+616
+377
+878
+785
+465
+814
+899
+430
+335
+597
+902
+703
+378
+735
+955
+543
+541
+312
+72
+182
+93
+464
+10
+916
+643
+2
+31
+209
+455
+128
+9
+728
+355
+781
+437
+437
+50
+50
+92
+595
+242
+842
+858
+964
+489
+221
+227
+537
+763
+348
+462
+640
+918
+162
+716
+578
+434
+885
+394
+179
+634
+625
+328
+803
+1000
+981
+128
+233
+24
+608
+111
+408
+885
+549
+370
+209
+441
+957
+125
+471
+857
+44
+692
+979
+284
+134
+686
+910
+611
+900
+194
+755
+347
+419
+156
+820
+625
+739
+806
+68
+951
+498
+756
+743
+832
+157
+458
+619
+933
+836
+896
+583
+583
+855
+35
+886
+408
+37
+747
+155
+144
+606
+255
+325
+402
+407
+387
+610
+167
+189
+95
+324
+770
+235
+741
+693
+825
+828
+294
+310
+524
+326
+832
+811
+557
+263
+681
+234
+457
+385
+539
+992
+756
+981
+235
+529
+52
+757
+602
+858
+989
+930
+410
+1
+541
+208
+220
+326
+96
+748
+749
+544
+339
+833
+553
+958
+893
+357
+547
+347
+623
+797
+746
+126
+823
+26
+415
+732
+782
+368
+
diff --git a/ex5/ex5_2/main.GCC_ b/ex5/ex5_2/main.GCC_
new file mode 100755
index 0000000..ef16f39
Binary files /dev/null and b/ex5/ex5_2/main.GCC_ differ
diff --git a/ex5/ex5_2/main.cpp b/ex5/ex5_2/main.cpp
new file mode 100644
index 0000000..1bea099
--- /dev/null
+++ b/ex5/ex5_2/main.cpp
@@ -0,0 +1,130 @@
+#include "mylib.h"
+#include <fstream>
+#include <iostream>
+#include <omp.h>
+#include <vector>
+using namespace std;
+
+
+int main()
+{
+    // read vector from file
+    vector<size_t> data_vector = {};
+
+    ifstream input_stream("data_1.txt"); 
+
+    size_t line;
+    while(input_stream >> line)
+    {
+        data_vector.push_back(line);
+    }
+    data_vector.shrink_to_fit();
+
+
+
+
+    // specify loops
+    size_t NLOOPS = 10000;
+
+
+
+    // ############# Parallelization with openMP #############
+    // calculate arithmetic mean, geometric mean and harmonic mean
+    double am_omp, gm_omp, hm_omp;
+
+    double tstart = omp_get_wtime();
+
+    for (size_t i = 0; i < NLOOPS; ++i)
+        means_omp(data_vector, am_omp, gm_omp, hm_omp);
+
+    double t_means_omp = (omp_get_wtime() - tstart)/NLOOPS;
+
+    // calculate minimum and maximum
+    size_t min, max;
+
+    tstart = omp_get_wtime();
+
+    for (size_t i = 0; i < NLOOPS; ++i)
+        minmax_omp(data_vector, min, max);
+
+    double t_minmax_omp = (omp_get_wtime() - tstart)/NLOOPS;
+
+
+
+    
+
+
+    // ############# Parallelization with C++ algorithms #############
+    // calculate arithmetic mean, geometric mean and harmonic mean
+    double am_cpp, gm_cpp, hm_cpp;
+
+    tstart = omp_get_wtime();
+
+    for (size_t i = 0; i < NLOOPS; ++i)
+        means_cpp(data_vector, am_cpp, gm_cpp, hm_cpp);
+
+    double t_means_cpp = (omp_get_wtime() - tstart)/NLOOPS;
+
+    // calculate minimum and maximum
+    size_t min_cpp, max_cpp;
+
+    tstart = omp_get_wtime();
+
+    for (size_t i = 0; i < NLOOPS; ++i)
+        minmax_cpp(data_vector, min_cpp, max_cpp);
+
+    double t_minmax_cpp = (omp_get_wtime() - tstart)/NLOOPS;
+
+
+
+
+
+    // print results
+    cout << "####### OpenMP #######" << endl;
+    cout << "minimum: " << min << endl;
+    cout << "maximum: " << max << endl;
+    cout << "duration: " << t_minmax_omp << endl << endl;
+
+    cout << "arithmetic mean: " << am_omp << endl;
+    cout << "geometric mean: " << gm_omp << endl;
+    cout << "harmonic mean: " << hm_omp << endl;
+    cout << "duration: " << t_means_omp << endl << endl;
+
+
+    cout << "####### C++ #######" << endl;
+    cout << "minimum: " << min_cpp << endl;
+    cout << "maximum: " << max_cpp << endl;
+    cout << "duration: " << t_minmax_cpp << endl << endl;
+
+    cout << "arithmetic mean: " << am_cpp << endl;
+    cout << "geometric mean: " << gm_cpp << endl;
+    cout << "harmonic mean: " << hm_cpp << endl;
+    cout << "duration: " << t_means_cpp << endl << endl;
+
+
+
+    // ####### OpenMP #######
+    // minimum: 1
+    // maximum: 1000
+    // duration: 3.52086e-06
+
+    // arithmetic mean: 498.184
+    // geometric mean: 364.412
+    // harmonic mean: 95.6857
+    // duration: 5.90171e-06
+
+    // ####### C++ #######
+    // minimum: 1
+    // maximum: 1000
+    // duration: 1.76816e-05
+
+    // arithmetic mean: 498.184
+    // geometric mean: 364.412
+    // harmonic mean: 95.6857
+    // duration: 2.35728e-05
+
+    // --> the openMP variant is faster in both cases
+
+
+    return 0;
+}
diff --git a/ex5/ex5_2/main.o b/ex5/ex5_2/main.o
new file mode 100644
index 0000000..0a6cbf3
Binary files /dev/null and b/ex5/ex5_2/main.o differ
diff --git a/ex5/ex5_2/mylib.cpp b/ex5/ex5_2/mylib.cpp
new file mode 100644
index 0000000..34e5992
--- /dev/null
+++ b/ex5/ex5_2/mylib.cpp
@@ -0,0 +1,103 @@
+#include "mylib.h"
+#include <algorithm>
+#include <cmath>
+#include <execution>
+#include <iostream>
+#include <numeric>
+#include <omp.h>
+
+using namespace std;
+
+
+void means_omp(const std::vector<size_t> numbers, double &am, double &gm, double &hm)
+{
+    size_t const n = numbers.size();
+
+    am = 0.;
+    gm = 0.;
+    hm = 0.;
+
+#pragma omp parallel for shared(numbers, n, cout) reduction(+:am, gm, hm)
+    for (size_t i = 0; i < n; ++i)
+    {
+        am += numbers[i];
+        gm += log(numbers[i]);
+        hm += 1.0/numbers[i];
+
+        // #pragma omp critical
+        // {
+        //     cout << "Thread number " << omp_get_thread_num() << " processes value " << numbers[i] << endl;
+        // }
+    }
+
+    am /= n;
+    gm = exp(gm/n);
+    hm = n/hm;
+}
+
+
+void minmax_omp(const std::vector<size_t> numbers, size_t &global_min, size_t &global_max)
+{
+    size_t const n = numbers.size();
+
+    global_min = -1; // gives the maximum size_t value
+    global_max = 0;
+
+#pragma omp parallel shared(numbers, n, global_min, global_max)
+    {
+        const size_t nthreads = omp_get_num_threads();
+        const size_t threadnum = omp_get_thread_num();
+        const size_t chunksize = n/nthreads;
+
+
+        size_t start = threadnum*chunksize;
+        size_t end = start + chunksize;
+        if (threadnum == nthreads - 1)
+            end = n;
+
+
+        size_t local_min = -1;
+        size_t local_max = 0;
+        for (size_t i = start; i < end ; ++i)
+        {
+            if (numbers[i] < local_min)
+                local_min = numbers[i];
+
+            if (numbers[i] > local_max)
+                local_max = numbers[i];
+        }
+
+        #pragma omp critical
+        {
+            if (local_min < global_min)
+                global_min = local_min;
+            
+            if (local_max > global_max)
+                global_max = local_max;
+        }
+    }
+}
+
+
+void means_cpp(const std::vector<size_t> numbers, double &am, double &gm, double &hm)
+{
+    size_t const n = numbers.size();
+
+    am = reduce(std::execution::par, numbers.begin(), numbers.end());
+    gm = transform_reduce(std::execution::par, numbers.begin(), numbers.end(), 0.0, plus{}, [] (size_t x) -> double { return log(x); } );
+    hm = transform_reduce(std::execution::par, numbers.begin(), numbers.end(), 0.0, plus{}, [] (size_t x) -> double { return 1.0/x; });
+
+    am /= n;
+    gm = exp(gm/n);
+    hm = n/hm;
+}
+
+
+void minmax_cpp(const std::vector<size_t> numbers, size_t &global_min, size_t &global_max)
+{
+    auto min_it = min_element(std::execution::par, numbers.begin(), numbers.end());
+    auto max_it = max_element(std::execution::par, numbers.begin(), numbers.end());
+    
+    global_min = *min_it;
+    global_max = *max_it;
+}
\ No newline at end of file
diff --git a/ex5/ex5_2/mylib.h b/ex5/ex5_2/mylib.h
new file mode 100644
index 0000000..01d2fa9
--- /dev/null
+++ b/ex5/ex5_2/mylib.h
@@ -0,0 +1,42 @@
+#include <vector>
+
+/**
+  This function calculates arithmetic mean, geometric mean and harmonic mean of an integer vector.
+  Uses openMP parallelization.
+  @param[in]    numbers      vector containing integers
+  @param[out]   am           arithmetic mean
+  @param[out]   gm           geometric mean
+  @param[out]   hm           harmonic mean
+*/
+void means_omp(const std::vector<size_t> numbers, double &am, double &gm, double &hm);
+
+
+/**
+  This function calculates the minimum and maximum of a vector.
+  Uses openMP parallelization.
+  @param[in]    numbers         vector containing integers
+  @param[out]   global_min      minimum
+  @param[out]   global_max      maximum
+*/
+void minmax_omp(const std::vector<size_t> numbers, size_t &global_min, size_t &global_max);
+
+
+/**
+  This function calculates arithmetic mean, geometric mean and harmonic mean of an integer vector.
+  Uses C++ parallelization.
+  @param[in]    numbers      vector containing integers
+  @param[out]   am           arithmetic mean
+  @param[out]   gm           geometric mean
+  @param[out]   hm           harmonic mean
+*/
+void means_cpp(const std::vector<size_t> numbers, double &am, double &gm, double &hm);
+
+
+/**
+  This function calculates the minimum and maximum of a vector.
+  Uses C++ parallelization.
+  @param[in]    numbers         vector containing integers
+  @param[out]   global_min      minimum
+  @param[out]   global_max      maximum
+*/
+void minmax_cpp(const std::vector<size_t> numbers, size_t &global_min, size_t &global_max);
\ No newline at end of file
diff --git a/ex5/ex5_2/mylib.o b/ex5/ex5_2/mylib.o
new file mode 100644
index 0000000..7b34810
Binary files /dev/null and b/ex5/ex5_2/mylib.o differ
diff --git a/ex5/ex5_3/Makefile b/ex5/ex5_3/Makefile
new file mode 100644
index 0000000..4a714a3
--- /dev/null
+++ b/ex5/ex5_3/Makefile
@@ -0,0 +1,30 @@
+#
+# use GNU-Compiler tools
+COMPILER=GCC_
+# alternatively from the shell
+# export COMPILER=GCC_
+# or, alternatively from the shell
+# make COMPILER=GCC_
+
+# use Intel compilers
+#COMPILER=ICC_
+
+# use PGI compilers
+# COMPILER=PGI_
+
+
+SOURCES = main.cpp goldbach.cpp 
+OBJECTS = $(SOURCES:.cpp=.o)
+
+PROGRAM	= main.${COMPILER}
+
+# uncomment the next to lines for debugging and detailed performance analysis
+CXXFLAGS += -g
+LINKFLAGS += -g
+# do not use -pg with PGI compilers
+
+ifndef COMPILER
+  COMPILER=GCC_
+endif
+
+include ../${COMPILER}default.mk
diff --git a/ex5/ex5_3/goldbach.cpp b/ex5/ex5_3/goldbach.cpp
new file mode 100644
index 0000000..55709c3
--- /dev/null
+++ b/ex5/ex5_3/goldbach.cpp
@@ -0,0 +1,46 @@
+#include "goldbach.h"
+#include <iostream>
+#include <iterator>
+#include <omp.h>
+
+size_t single_goldbach(size_t k)
+{
+    const std::vector<size_t> relevant_primes = get_primes(k);
+    size_t m = relevant_primes.size();
+
+    size_t counter = 0;
+
+#pragma omp parallel for shared(relevant_primes, m, k) reduction(+:counter)
+    for(size_t i = 0; i < m; ++i)
+    {
+        for(size_t j = i; j < m; ++j)
+        {
+            if(relevant_primes[i] + relevant_primes[j] == k)
+                ++counter;
+        }
+    }
+
+    return counter;
+}
+
+
+std::vector<size_t> count_goldbach(size_t n)
+{
+    const std::vector<size_t> relevant_primes = get_primes(n);
+    size_t m = relevant_primes.size();
+
+    std::vector<size_t> counter_vector(n + 1, 0);
+
+#pragma omp parallel for shared(relevant_primes, m, n) reduction(VecAdd:counter_vector)
+    for(size_t i = 0; i < m; ++i)
+    {
+        for(size_t j = i; j < m; ++j)
+        {
+            size_t sum = relevant_primes[i] + relevant_primes[j];
+            if(sum <= n)
+                ++counter_vector[relevant_primes[i] + relevant_primes[j]];
+        }
+    }
+
+    return counter_vector;
+}
diff --git a/ex5/ex5_3/goldbach.h b/ex5/ex5_3/goldbach.h
new file mode 100644
index 0000000..15df6d4
--- /dev/null
+++ b/ex5/ex5_3/goldbach.h
@@ -0,0 +1,45 @@
+#pragma once
+#include "mayer_primes.h"
+#include <cassert>
+#include <vector>
+
+
+/**
+  This function returns the number of possible decompositions of an integer into a sum of two prime numbers.
+  @param[in]    k            first integer
+  @param[out]   count        number of decompositions
+*/
+size_t single_goldbach(size_t k);
+
+
+/**
+  This function returns the number of possible decompositions into a sum of two prime numbers of all even integers in the interval [4,n].
+  @param[in]    n                   upper integer bound
+  @param[out]   count_vector        vector containing the number of decompositions for a natural number the corresponding index
+*/
+std::vector<size_t> count_goldbach(size_t n);
+
+
+/** 	 Vector @p b adds its elements to vector @p a .
+	@param[in] a	vector
+	@param[in] b	vector
+	@return 	     a+=b componentwise
+*/
+template<class T>
+std::vector<T> &operator+=(std::vector<T> &a, std::vector<T> const &b)
+{
+    assert(a.size()==b.size());
+    for (size_t k = 0; k < a.size(); ++k) {
+        a[k] += b[k];
+    }
+    return a;
+}
+
+// Declare the reduction operation in OpenMP for an STL-vector
+//   omp_out += omp_in  requires operator+=(vector<int> &, vector<int> const &) from above
+// ------------------------------------------------------------
+// https://scc.ustc.edu.cn/zlsc/tc4600/intel/2016.0.109/compiler_c/common/core/GUID-7312910C-D175-4544-99C5-29C12D980744.htm
+// https://gist.github.com/eruffaldi/7180bdec4c8c9a11f019dd0ba9a2d68c
+// https://stackoverflow.com/questions/29633531/user-defined-reduction-on-vector-of-varying-size
+//  see also p.74ff in  https://www.fz-juelich.de/ias/jsc/EN/AboutUs/Staff/Hagemeier_A/docs-parallel-programming/OpenMP-Slides.pdf
+#pragma omp declare reduction(VecAdd : std::vector<size_t>  : omp_out += omp_in) initializer (omp_priv=omp_orig)
\ No newline at end of file
diff --git a/ex5/ex5_3/main.cpp b/ex5/ex5_3/main.cpp
new file mode 100644
index 0000000..1be01c2
--- /dev/null
+++ b/ex5/ex5_3/main.cpp
@@ -0,0 +1,45 @@
+#include "goldbach.h"
+#include <algorithm>
+#include <iostream>
+#include <omp.h>
+using namespace std;
+
+
+int main()
+{
+    cout << "Check: 694 has "<< single_goldbach(694) << " decompositions." << endl << "----------------------------------------" << endl;
+
+    for(size_t n : {10000, 100000, 400000, 1000000, 2000000})
+    {
+        double t_start = omp_get_wtime();
+
+        auto goldbach_vector = count_goldbach(n);
+
+        auto max_it = max_element(goldbach_vector.begin(), goldbach_vector.end());
+        size_t max_number = distance(goldbach_vector.begin(), max_it);
+
+        double t_end = omp_get_wtime() - t_start;
+
+        cout << "The number " <<  max_number << " has " << *max_it << " decompositions. Duration: " << t_end << endl;
+    }
+
+    /*
+    ###### WITHOUT PARALLELIZATION ######
+    The number 9240 has 329 decompositions. Duration: 0.00307696
+    The number 99330 has 2168 decompositions. Duration: 0.189839
+    The number 390390 has 7094 decompositions. Duration: 1.3042
+    The number 990990 has 15594 decompositions. Duration: 5.45034
+    The number 1981980 has 27988 decompositions. Duration: 47.1807
+
+
+    ###### WITH PARALLELIZATION ######
+    The number 9240 has 329 decompositions. Duration: 0.000734854
+    The number 99330 has 2168 decompositions. Duration: 0.0251322
+    The number 390390 has 7094 decompositions. Duration: 0.487375
+    The number 990990 has 15594 decompositions. Duration: 6.16972
+    The number 1981980 has 27988 decompositions. Duration: 31.5699
+    */
+
+
+    return 0;
+}
\ No newline at end of file
diff --git a/ex5/ex5_3/mayer_primes.h b/ex5/ex5_3/mayer_primes.h
new file mode 100644
index 0000000..6c6dc23
--- /dev/null
+++ b/ex5/ex5_3/mayer_primes.h
@@ -0,0 +1,73 @@
+#pragma once
+
+#include <cstring> //memset
+#include <vector>
+//using namespace std;
+
+/** \brief Determines all prime numbers in interval [2, @p max].
+ *
+ *  The sieve of Eratosthenes is used.
+ *
+ *  The implementation originates from <a href="http://code.activestate.com/recipes/576559-fast-prime-generator/">Florian Mayer</a>.
+ *
+ * \param[in]   max end of interval for the prime number search.
+ * \return  vector of prime numbers @f$2,3,5, ..., p<=max @f$.
+ *
+ * \copyright
+ * Copyright (c) 2008 Florian Mayer          (adapted by Gundolf Haase 2018)
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+template <class T>
+std::vector<T> get_primes(T max)
+{
+    std::vector<T> primes;
+    char *sieve;
+    sieve = new char[max / 8 + 1];
+    // Fill sieve with 1
+    memset(sieve, 0xFF, (max / 8 + 1) * sizeof(char));
+    for (T x = 2; x <= max; x++)
+    {
+        if (sieve[x / 8] & (0x01 << (x % 8))) {
+            primes.push_back(x);
+            // Is prime. Mark multiplicates.
+            for (T j = 2 * x; j <= max; j += x)
+            {
+                sieve[j / 8] &= ~(0x01 << (j % 8));
+			}
+        }
+	}
+    delete[] sieve;
+    return primes;
+}
+
+//---------------------------------------------------------------
+//int main()        // by  Florian Mayer
+//{g++ -O3  -std=c++14 -fopenmp main.cpp && ./a.out
+//    vector<unsigned long> primes;
+//    primes = get_primes(10000000);
+//    // return 0;
+//    // Print out result.
+//    vector<unsigned long>::iterator it;
+//    for(it=primes.begin(); it < primes.end(); it++)
+//        cout << *it << " ";
+//
+//    cout << endl;
+//    return 0;
+//}
diff --git a/ex5/ex5_4/Makefile b/ex5/ex5_4/Makefile
new file mode 100644
index 0000000..b2129d8
--- /dev/null
+++ b/ex5/ex5_4/Makefile
@@ -0,0 +1,30 @@
+#
+# use GNU-Compiler tools
+COMPILER=GCC_
+# alternatively from the shell
+# export COMPILER=GCC_
+# or, alternatively from the shell
+# make COMPILER=GCC_
+
+# use Intel compilers
+#COMPILER=ICC_
+
+# use PGI compilers
+# COMPILER=PGI_
+
+
+SOURCES = main.cpp benchmarks.cpp benchmark_tests.cpp
+OBJECTS = $(SOURCES:.cpp=.o)
+
+PROGRAM	= main.${COMPILER}
+
+# uncomment the next to lines for debugging and detailed performance analysis
+CXXFLAGS += -g
+LINKFLAGS += -g
+# do not use -pg with PGI compilers
+
+ifndef COMPILER
+  COMPILER=GCC_
+endif
+
+include ../${COMPILER}default.mk
diff --git a/ex5/ex5_4/benchmark_tests.cpp b/ex5/ex5_4/benchmark_tests.cpp
new file mode 100644
index 0000000..35ed87b
--- /dev/null
+++ b/ex5/ex5_4/benchmark_tests.cpp
@@ -0,0 +1,375 @@
+#include "benchmark_tests.h"
+#include "benchmarks.h"
+#include <chrono>
+#include <iostream>
+#include <math.h>       
+using namespace std::chrono; 
+
+vector<double> test_A(const size_t &NLOOPS, const size_t &N)
+{
+    cout << "#################### (A) ####################" << endl;
+    cout << "\nLOOPS = " << NLOOPS << endl;
+    cout << "\nN = " << N << endl;
+
+
+//  Memory allocation
+    cout << "Memory allocation\n";
+
+    vector<double> x(N), y(N);
+
+    cout.precision(2);
+    cout << 2.0*N *sizeof(x[0]) / 1024 / 1024 / 1024 << " GByte Memory allocated\n";
+    cout.precision(6);
+
+
+//  Data initialization
+//  Special:  x_i = i+1;  y_i = 1/x_i  ==> <x,y> == N
+
+    for (size_t i = 0; i < N; ++i)
+    {
+        x[i] = i % 219 + 1;
+        y[i] = 1.0/x[i];
+    }
+
+
+    cout << "\nStart Benchmarking scalar\n";
+
+    auto t1 = system_clock::now(); // start timer
+// Do calculation
+    double check(0.0),ss(0.0);
+    for (size_t i = 0; i < NLOOPS; ++i)
+    {
+        check = scalar_parallel(x, y);
+        ss += check;                   // prevents the optimizer from removing unused calculation results.
+    }
+
+    auto t2 = system_clock::now();  // stop timer
+    auto duration = duration_cast<microseconds>(t2 - t1);        // duration in microseconds
+    double t_diff = static_cast<double>(duration.count()) / 1e6; // overall duration in seconds
+    t_diff = t_diff/NLOOPS;                                      // duration per loop seconds
+    
+
+
+// Check the correct result
+    cout << "\n <x,y> = " << check << endl;
+    if (static_cast<unsigned int>(check) != N)
+        cout << "  !!   W R O N G  result   !!\n";
+    cout << endl;
+
+
+// Timings  and Performance
+    cout << endl;
+    cout.precision(2);
+
+
+    double Gflops = 2.0*N / t_diff / 1024 / 1024 / 1024;
+    double MemBandwidth = 2.0*N / t_diff / 1024 / 1024 / 1024 * sizeof(x[0]);
+
+    cout << "Total duration : " << t_diff*NLOOPS << endl;
+    cout << "Timing in sec. : " << t_diff << endl;
+    cout << "GFLOPS         : " << Gflops << endl;
+    cout << "GiByte/s       : " << MemBandwidth << endl;
+
+
+    return vector<double>{t_diff, Gflops, MemBandwidth};
+}
+
+vector<double> test_A_sum(const size_t &NLOOPS, const size_t &N)
+{
+    cout << "#################### (A) sum ####################" << endl;
+    cout << "\nLOOPS = " << NLOOPS << endl;
+    cout << "\nN = " << N << endl;
+
+
+//  Memory allocation
+    cout << "Memory allocation\n";
+
+    vector<double> x(N);
+
+    cout.precision(2);
+    cout << 1.0*N *sizeof(x[0]) / 1024 / 1024 / 1024 << " GByte Memory allocated\n";
+    cout.precision(6);
+
+
+//  Data initialization
+
+    for (size_t i = 0; i < N; ++i)
+    {
+        x[i] = 1;
+    }
+
+
+    cout << "\nStart Benchmarking sum\n";
+
+    auto t1 = system_clock::now(); // start timer
+// Do calculation
+    double check(0.0),ss(0.0);
+    for (size_t i = 0; i < NLOOPS; ++i)
+    {
+        check = sum(x);
+        ss += check;                   // prevents the optimizer from removing unused calculation results.
+    }
+
+    auto t2 = system_clock::now();  // stop timer
+    auto duration = duration_cast<microseconds>(t2 - t1);        // duration in microseconds
+    double t_diff = static_cast<double>(duration.count()) / 1e6; // overall duration in seconds
+    t_diff = t_diff/NLOOPS;                                      // duration per loop seconds
+    
+
+
+// Check the correct result
+    cout << "\n <x,y> = " << check << endl;
+    if (static_cast<unsigned int>(check) != N)
+        cout << "  !!   W R O N G  result   !!\n";
+    cout << endl;
+
+
+// Timings  and Performance
+    cout << endl;
+    cout.precision(2);
+
+
+    double Gflops = 1.0*N / t_diff / 1024 / 1024 / 1024;
+    double MemBandwidth = 1.0*N / t_diff / 1024 / 1024 / 1024 * sizeof(x[0]);
+
+    cout << "Total duration : " << t_diff*NLOOPS << endl;
+    cout << "Timing in sec. : " << t_diff << endl;
+    cout << "GFLOPS         : " << Gflops << endl;
+    cout << "GiByte/s       : " << MemBandwidth << endl;
+
+
+    return vector<double>{t_diff, Gflops, MemBandwidth};
+}
+
+
+vector<double> test_B(const size_t &NLOOPS, const size_t &N, const size_t &M)
+{
+    cout << "#################### (B) ####################" << endl;
+
+    cout << "\nLOOPS = " << NLOOPS << endl;
+    cout << "\nN = " << N << endl;
+    cout << "\nM = " << M << endl;
+
+//  Memory allocation
+    cout << "Memory allocation\n";
+
+    vector<double> A(M*N);
+    vector<double> x(N);
+
+    cout.precision(2);
+    cout << (1.0*M*N + N) * sizeof(x[0]) / 1024 / 1024 / 1024 << " GByte Memory allocated\n";
+    cout.precision(6);
+
+//  Data initialization
+
+    for (size_t i = 0; i < M; ++i)
+        for (size_t j = 0; j < N; ++j)
+            A[N*i + j] = (i + j) % 219 + 1;
+
+
+    for (size_t j = 0; j < N; ++j)
+    {
+        x[j] = 1.0/A[N*17 + j];
+    }
+
+    cout << "\nStart Benchmarking MatVec\n";
+
+    auto t1 = system_clock::now(); // start timer
+// Do calculation
+    vector<double> b(M);
+    
+    for (size_t i = 0; i < NLOOPS; ++i)
+    {
+        b = MatVec_parallel(A, x);
+    }
+
+    auto t2 = system_clock::now();  // stop timer
+    auto duration = duration_cast<microseconds>(t2 - t1);        // duration in microseconds
+    double t_diff = static_cast<double>(duration.count()) / 1e6; // overall duration in seconds
+    t_diff = t_diff/NLOOPS;                                      // duration per loop seconds
+
+
+// Check the correct result
+    cout << "\n <A[17,*],x> = " << b[17] << endl;
+    if (static_cast<size_t>(b[17]) != N)
+    {
+        cout << "  !!   W R O N G  result   !!\n";
+    }
+    cout << endl;
+
+
+// Timings  and Performance
+    cout << endl;
+    cout.precision(2);
+
+    double Gflops = (2.0*N*M) / t_diff / 1024 / 1024 / 1024;
+    double MemBandwidth = (2.0*N*M + M)/ t_diff / 1024 / 1024 / 1024 * sizeof(x[0]);
+
+    cout << "Total duration : " << t_diff*NLOOPS << endl;
+    cout << "Timing in sec. : " << t_diff << endl;
+    cout << "GFLOPS         : " << Gflops << endl;
+    cout << "GiByte/s       : " << MemBandwidth << endl;
+
+
+
+    return vector<double>{t_diff, Gflops, MemBandwidth};
+}
+
+
+vector<double> test_C(const size_t &NLOOPS, const size_t &L, const size_t &M, const size_t &N)
+{
+    cout << "#################### (C) ####################" << endl;
+    cout << "\nLOOPS = " << NLOOPS << endl;
+    cout << "\nL = " << L << endl;
+    cout << "\nM = " << M << endl;
+    cout << "\nN = " << N << endl;
+
+
+//  Memory allocation
+    cout << "Memory allocation\n";
+
+    vector<double> A(M*L);
+    vector<double> B(L*N);
+
+    cout.precision(2);
+    cout << (1.0*M*L + L*N) *sizeof(A[0]) / 1024 / 1024 / 1024 << " GByte Memory allocated\n";
+    cout.precision(6);
+
+
+//  Data initialization
+
+    for (size_t i = 0; i < M; ++i)
+        for (size_t k = 0; k < L; ++k)
+            A[L*i + k] = (i + k) % 219 + 1;
+
+    for (size_t k = 0; k < L; ++k)
+        for (size_t j = 0; j < N; ++j)
+            B[N*k + j] = 1.0/A[L*17 + k];
+
+
+    cout << "\nStart Benchmarking MatMat\n";
+
+    auto t1 = system_clock::now(); // start timer
+// Do calculation
+    vector<double> C(M*N);
+    double check;
+    double check_sum = 0;
+
+    for (size_t i = 0; i < NLOOPS; ++i)
+    {
+        C = MatMat_parallel(A, B, L);
+
+        check = C[N*17];
+        check_sum += check; // prevents the optimizer from removing unused calculation results.
+    }
+    cout << check_sum;
+
+    auto t2 = system_clock::now();  // stop timer
+    auto duration = duration_cast<microseconds>(t2 - t1);        // duration in microseconds
+    double t_diff = static_cast<double>(duration.count()) / 1e6; // overall duration in seconds
+    t_diff = t_diff/NLOOPS;                                      // duration per loop seconds
+
+
+// Check the correct result
+    cout << "\n C[17,0] = " << check << endl;
+    if (static_cast<unsigned int>(check) != L)
+    {
+        cout << "  !!   W R O N G  result   !!, should be " << L <<"\n";
+    }
+    cout << endl;
+
+// Timings  and Performance
+    cout << endl;
+    cout.precision(2);
+
+
+    double Gflops = (2.0*L*N*M) / t_diff / 1024 / 1024 / 1024;
+    double MemBandwidth = (2.0*L*N*M + M*N)/ t_diff / 1024 / 1024 / 1024 * sizeof(A[0]);
+
+    cout << "Total duration : " << t_diff*NLOOPS << endl;
+    cout << "Timing in sec. : " << t_diff << endl;
+    cout << "GFLOPS         : " << Gflops << endl;
+    cout << "GiByte/s       : " << MemBandwidth << endl;
+
+
+
+    return vector<double>{t_diff, Gflops, MemBandwidth};
+}
+
+
+vector<double> test_D(const size_t &NLOOPS, const size_t &N, const size_t &p)
+{
+    cout << "#################### (D) ####################" << endl;
+    cout << "\nLOOPS = " << NLOOPS << endl;
+    cout << "\nN = " << N << endl;
+    cout << "\np = " << p << endl;
+
+//  Memory allocation
+    cout << "Memory allocation\n";
+
+    vector<double> a(p + 1, 0);
+    vector<double> x(N);
+
+    cout.precision(2);
+    cout << (1.0*(p + 1) + N) *sizeof(x[0]) / 1024 / 1024 / 1024 << " GByte Memory allocated\n";
+    cout.precision(6);
+
+//  Data initialization
+
+    for (size_t j = 0; j < N; ++j)
+        x[j] = 1.0*j;
+    
+    for (size_t k = 0; k < p + 1; ++k)
+        a[k] = pow(-1.0, k);         // poly(x) = 1 - x + x^2 - x^3 + x^4 - ...
+
+
+
+    cout << "\nStart Benchmarking poly\n";
+
+    auto t1 = system_clock::now(); // start timer
+// Do calculation
+    vector<double> y(N);
+    double check;
+    double check_sum;
+    
+    for (size_t i = 0; i < NLOOPS; ++i)
+    {
+        y = poly_parallel(a, x);
+        check = y[0];
+
+        check_sum += check; // prevents the optimizer from removing unused calculation results.
+    }
+
+    auto t2 = system_clock::now();  // stop timer
+    auto duration = duration_cast<microseconds>(t2 - t1);        // duration in microseconds
+    double t_diff = static_cast<double>(duration.count()) / 1e6; // overall duration in seconds
+    t_diff = t_diff/NLOOPS;                                      // duration per loop seconds
+    
+
+
+// Check the correct result
+    cout << "\n poly(" << x[0] << ") = " << check << endl;
+    if (abs(check - 1.0) > 1.0/1e6)
+    {
+        cout << "  !!   W R O N G  result   !!\n";
+    }
+    cout << endl;
+
+
+// Timings  and Performance
+    cout << endl;
+    cout.precision(2);
+    
+
+    double Gflops = (N*(p + 1)*3.0) / t_diff / 1024 / 1024 / 1024;
+    double MemBandwidth = (N*(2.0 + 3.0*(p + 1)))/ t_diff / 1024 / 1024 / 1024 * sizeof(x[0]);
+
+    cout << "Total duration : " << t_diff*NLOOPS << endl;
+    cout << "Timing in sec. : " << t_diff << endl;
+    cout << "GFLOPS         : " << Gflops << endl;
+    cout << "GiByte/s       : " << MemBandwidth << endl;
+
+
+
+    return vector<double>{t_diff, Gflops, MemBandwidth};
+}
\ No newline at end of file
diff --git a/ex5/ex5_4/benchmark_tests.h b/ex5/ex5_4/benchmark_tests.h
new file mode 100644
index 0000000..45c64a9
--- /dev/null
+++ b/ex5/ex5_4/benchmark_tests.h
@@ -0,0 +1,13 @@
+#pragma once
+#include <vector>
+using namespace std;
+
+vector<double> test_A(const size_t &NLOOPS, const size_t &N);
+
+vector<double> test_A_sum(const size_t &NLOOPS, const size_t &N);
+
+vector<double> test_B(const size_t &NLOOPS, const size_t &N, const size_t &M);
+
+vector<double> test_C(const size_t &NLOOPS, const size_t &L, const size_t &M, const size_t &N);
+
+vector<double> test_D(const size_t &NLOOPS, const size_t &N, const size_t &p);
\ No newline at end of file
diff --git a/ex5/ex5_4/benchmarks.cpp b/ex5/ex5_4/benchmarks.cpp
new file mode 100644
index 0000000..c0eb7d9
--- /dev/null
+++ b/ex5/ex5_4/benchmarks.cpp
@@ -0,0 +1,141 @@
+#include "benchmarks.h"
+#include <cassert>       // assert()
+#include <cmath>
+#include <iostream>
+#include <vector>
+#include <omp.h>
+
+// (A) Inner product of two vectors (from skalar_stl)
+double scalar_parallel(vector<double> const &x, vector<double> const &y)
+{
+    assert(x.size() == y.size());
+    size_t const N = x.size();
+    double sum = 0.0;
+//#pragma omp parallel for default(none) shared(x, y, N) reduction(+:sum) schedule(runtime)
+#pragma omp parallel for shared(x, y, N) reduction(+:sum)
+    for (size_t i = 0; i < N; ++i)
+    {
+        sum += x[i] * y[i];
+    }
+    return sum;
+}
+
+// (A) Vector entry sum
+double sum(vector<double> const &x)
+{
+    double sum = 0.0;
+#pragma omp parallel for shared(x) reduction(+:sum)
+    for (size_t i = 0; i < x.size(); ++i)
+    {
+        sum += x[i];
+    }
+    return sum;
+}
+
+
+// (B) Matrix-vector product (from intro_vector_densematrix)
+vector<double> MatVec_parallel(vector<double> const &A, vector<double> const &x)
+{
+    size_t const nelem = A.size();
+    size_t const N = x.size();
+    assert(nelem % N == 0); // make sure multiplication is possible
+    size_t const M = nelem/N;          
+
+    vector<double> b(M);    
+
+#pragma omp parallel for shared(A, x, N, M, b)
+    for (size_t i = 0; i < M; ++i)
+    {
+        double tmp = 0.0;
+        for (size_t j = 0; j < N; ++j)
+            tmp += A[N*i + j] * x[j];
+        b[i] = tmp;
+    }
+
+    return b;
+}
+
+
+// (C) Matrix-matrix product
+vector<double> MatMat_parallel(vector<double> const &A, vector<double> const &B, size_t const &L)
+{
+    size_t const nelem_A = A.size();
+    size_t const nelem_B = B.size();
+
+    assert(nelem_A % L == 0 && nelem_B % L == 0);
+
+    size_t const M = nelem_A/L;
+    size_t const N = nelem_B/L;
+
+
+    vector<double> C(M*N);
+
+            
+#pragma omp parallel for shared(A, B, M, N, L, C)
+    for (size_t i = 0; i < M; ++i)
+    {
+        for (size_t k = 0; k < L; ++k)
+        {
+            for (size_t j = 0; j < N; ++j)
+            {
+                C[N*i + j] += A[L*i + k]*B[N*k + j];
+            }
+            
+        }
+    }
+
+    return C;
+}
+
+
+// (D) Evaluation of a polynomial function
+vector<double> poly_parallel(vector<double> const &a, vector<double> const &x)
+{
+    size_t const N = x.size();
+    size_t const p = a.size() - 1;
+    vector<double> y(N, 0);
+
+#pragma omp parallel for shared(a, x, N, p, y)
+    for (size_t i = 0; i < N; ++i)
+    {
+        double x_temp = x[i];
+        double y_temp = 0;
+        for (size_t k = 0; k < p + 1; ++k)
+        {
+            y_temp += x_temp*y_temp + a[p - k];
+        }
+        y[i] = y_temp;
+    }
+
+    return y;
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/ex5/ex5_4/benchmarks.h b/ex5/ex5_4/benchmarks.h
new file mode 100644
index 0000000..2b55d2b
--- /dev/null
+++ b/ex5/ex5_4/benchmarks.h
@@ -0,0 +1,55 @@
+#pragma once
+#include <vector>
+using namespace std;
+
+/**     (A) Inner product of two vectors (from skalar_stl)
+	@param[in] x	vector
+	@param[in] y	vector
+	@return 	    resulting Euclidian inner product <x,y>
+*/
+double scalar_parallel(vector<double> const &x, vector<double> const &y);
+
+
+/**     (A) Sum entries of vector
+	@param[in] x	vector
+	@return 	    sum
+*/
+double sum(vector<double> const &x);
+
+
+/** 	(B) Matrix-vector product (from intro_vector_densematrix)
+ * 	@param[in] A	dense matrix (1D access)
+ *  @param[in] u	vector
+ *
+ *	@return    resulting vector
+*/
+vector<double> MatVec_parallel(vector<double> const &A, vector<double> const &x);
+
+
+/** 	(C) Matrix-matrix product
+ * 	@param[in] A			MxL dense matrix (1D access)
+ *  @param[in] B			LxN dense matrix (1D access)
+ *  @param[in] shared_dim 	shared dimension L
+ *
+ *	@return    resulting MxN matrix
+*/
+vector<double> MatMat_parallel(vector<double> const &A, vector<double> const &B, size_t const &shared_dim);
+
+
+/** 	(D) Evaluation of a polynomial function using Horner's scheme
+ * 	@param[in] a	coefficient vector
+ *  @param[in] x	vector with input values
+ *
+ *	@return    vector with output values
+*/
+vector<double> poly_parallel(vector<double> const &a, vector<double> const &x);
+
+
+
+
+
+
+
+
+
+
diff --git a/ex5/ex5_4/main.cpp b/ex5/ex5_4/main.cpp
new file mode 100644
index 0000000..d8fa63d
--- /dev/null
+++ b/ex5/ex5_4/main.cpp
@@ -0,0 +1,84 @@
+#include "benchmark_tests.h"
+#include <iostream>
+#include <cmath>
+
+int main()
+{   
+    vector<vector<double>> results_scalar;
+    results_scalar.push_back(test_A(2000000, pow(10,3)));
+    results_scalar.push_back(test_A(1000000, pow(10,4)));
+    results_scalar.push_back(test_A(100000, pow(10,5)));
+    results_scalar.push_back(test_A(10000, pow(10,6)));
+    results_scalar.push_back(test_A(750, pow(10,7)));
+    results_scalar.push_back(test_A(125, pow(10,8)));
+
+
+    vector<vector<double>> results_sum;
+    results_sum.push_back(test_A_sum(3000000, pow(10,3)));
+    results_sum.push_back(test_A_sum(2000000, pow(10,4)));
+    results_sum.push_back(test_A_sum(1000000, pow(10,5)));
+    results_sum.push_back(test_A_sum(50000, pow(10,6)));
+    results_sum.push_back(test_A_sum(2000, pow(10,7)));
+    results_sum.push_back(test_A_sum(250, pow(10,8)));
+
+
+    test_B(100, 20000, 10000);
+
+    test_C(25, 500, 1000, 1500);
+
+    test_D(100, 100, 1000000);
+    
+
+
+    cout << endl << "###### Scalar ######" << endl;
+    cout << "Timing\tGFLOPS\tGiByte/s" << endl;
+    cout << "------------------------------" << endl;
+    for (size_t i = 0; i < results_scalar.size(); ++i)
+        cout << results_scalar[i][0] << "\t" << results_scalar[i][1] << "\t" << results_scalar[i][2] << endl;
+
+    cout << endl << "###### Sum ######" << endl;
+    cout << "Timing\tGFLOPS\tGiByte/s" << endl;
+    cout << "------------------------------" << endl;
+    for (size_t i = 0; i < results_sum.size(); ++i)
+        cout << results_sum[i][0] << "\t" << results_sum[i][1] << "\t" << results_sum[i][2] << endl;
+
+
+
+    
+    // ###### Scalar ######
+    // Timing  GFLOPS  GiByte/s
+    // ------------------------------
+    // 3.4e-06 0.54    4.3
+    // 4.6e-06 4       32
+    // 1.6e-05 12      95
+    // 0.0011  1.7     13
+    // 0.0097  1.9     15
+    // 0.075   2.5     20
+
+
+    // ###### Sum ######
+    // Timing  GFLOPS  GiByte/s
+    // ------------------------------
+    // 5.5e-06 0.17    1.3
+    // 5.4e-06 1.7     14
+    // 1.5e-05 6.1     49
+    // 0.00013 7.2     57
+    // 0.0033  2.8     23
+    // 0.032   2.9     23
+
+    
+
+    
+    
+    // ######### NOT PARALLEL (from exercise sheet 2) #########
+    //      Timing  GFLOPS  GiByte/s
+    // ----------------------------------
+    // (A)  0.038   2.5     20
+    // (B)  0.13    2.9     23
+    // (C)  0.44    3.2     25
+    // (D)  0.19    1.5     12
+
+
+
+    return 0;
+}