Pushing everything again, accidentally deleted my remote repository

2025-12-09 22:06:13 +01:00 · 2025-12-09 22:06:13 +01:00 · 1bee3e8e5b
commit 1bee3e8e5b
101 changed files with 9428 additions and 0 deletions
--- a/ex5/ex5_4/Makefile
+++ b/ex5/ex5_4/Makefile
@ -0,0 +1,30 @@
+#
+# Build configuration for exercise 5.4. The build rules themselves are
+# presumably provided by the compiler-specific makefile included at the bottom.
+#
+# use GNU-Compiler tools
+COMPILER=GCC_
+# alternatively from the shell
+# export COMPILER=GCC_
+# or, alternatively from the shell
+# make COMPILER=GCC_
+# NOTE(review): COMPILER is assigned unconditionally above, so an exported
+# environment variable is presumably overridden (unless make runs with -e);
+# a command-line assignment (make COMPILER=...) still takes precedence.
+
+# use Intel compilers
+#COMPILER=ICC_
+
+# use PGI compilers
+# COMPILER=PGI_
+
+
+# translation units of the benchmark program and the object files derived from them
+SOURCES = main.cpp benchmarks.cpp benchmark_tests.cpp
+OBJECTS = $(SOURCES:.cpp=.o)
+
+# the executable name carries the compiler tag, e.g. main.GCC_
+PROGRAM	= main.${COMPILER}
+
+# the next two lines add debug symbols (for debugging and detailed performance
+# analysis); comment them out for a build without -g
+CXXFLAGS += -g
+LINKFLAGS += -g
+# do not use -pg with PGI compilers
+
+# fallback default; it has no effect while COMPILER is assigned at the top of this file
+ifndef COMPILER
+  COMPILER=GCC_
+endif
+
+# include the makefile for the selected compiler from the parent directory
+include ../${COMPILER}default.mk
--- a/ex5/ex5_4/benchmark_tests.cpp
+++ b/ex5/ex5_4/benchmark_tests.cpp
@ -0,0 +1,375 @@
+#include "benchmark_tests.h"
+#include "benchmarks.h"
+#include <chrono>
+#include <iostream>
+#include <math.h>       
+using namespace std::chrono; 
+
+/** Benchmark (A): times the inner product scalar_parallel(x, y).
+ *
+ *  The vectors are filled so that every product x[i]*y[i] equals 1 up to
+ *  rounding, hence the expected result is N.
+ *
+ *  @param[in] NLOOPS  number of timed repetitions
+ *  @param[in] N       vector length
+ *  @return            {seconds per repetition, GFLOPS, GiByte/s}; both rates
+ *                     are scaled by 1024^3
+ */
+vector<double> test_A(const size_t &NLOOPS, const size_t &N)
+{
+    cout << "#################### (A) ####################" << endl;
+    cout << "\nLOOPS = " << NLOOPS << endl;
+    cout << "\nN = " << N << endl;
+
+//  Memory allocation
+    cout << "Memory allocation\n";
+
+    vector<double> x(N), y(N);
+
+    cout.precision(2);
+    cout << 2.0*N *sizeof(x[0]) / 1024 / 1024 / 1024 << " GByte Memory allocated\n";
+    cout.precision(6);
+
+//  Data initialization
+//  Special:  x_i = (i mod 219) + 1;  y_i = 1/x_i  ==> <x,y> == N (up to rounding)
+    for (size_t i = 0; i < N; ++i)
+    {
+        x[i] = i % 219 + 1;
+        y[i] = 1.0/x[i];
+    }
+
+    cout << "\nStart Benchmarking scalar\n";
+
+    // steady_clock is monotonic; system_clock may jump when the wall clock is adjusted
+    auto const t1 = steady_clock::now(); // start timer
+// Do calculation
+    double check(0.0), ss(0.0);
+    for (size_t i = 0; i < NLOOPS; ++i)
+    {
+        check = scalar_parallel(x, y);
+        ss += check;                   // keeps a data dependency on every result
+    }
+    auto const t2 = steady_clock::now();  // stop timer
+
+    auto const duration = duration_cast<microseconds>(t2 - t1);  // duration in microseconds
+    double t_diff = static_cast<double>(duration.count()) / 1e6; // overall duration in seconds
+    t_diff = t_diff/NLOOPS;                                      // duration per loop seconds
+
+// Check the correct result
+    cout << "\n <x,y> = " << check << endl;
+    // Compare with a tolerance: truncating to an integer would flag a result
+    // of N - epsilon as wrong and would wrap for N beyond the unsigned range.
+    if (fabs(check - static_cast<double>(N)) > 0.5)
+        cout << "  !!   W R O N G  result   !!\n";
+    cout << endl;
+
+// Timings  and Performance
+    cout << endl;
+    cout.precision(2);
+
+    // 2 flops (multiply + add) and 2 loaded doubles per vector entry
+    double const Gflops = 2.0*N / t_diff / 1024 / 1024 / 1024;
+    double const MemBandwidth = 2.0*N / t_diff / 1024 / 1024 / 1024 * sizeof(x[0]);
+
+    cout << "Total duration : " << t_diff*NLOOPS << endl;
+    cout << "Timing in sec. : " << t_diff << endl;
+    cout << "GFLOPS         : " << Gflops << endl;
+    cout << "GiByte/s       : " << MemBandwidth << endl;
+
+    return vector<double>{t_diff, Gflops, MemBandwidth};
+}
+
+/** Benchmark (A) sum: times the vector entry sum computed by sum(x).
+ *
+ *  All entries of x are 1, so the expected result is N.
+ *
+ *  @param[in] NLOOPS  number of timed repetitions
+ *  @param[in] N       vector length
+ *  @return            {seconds per repetition, GFLOPS, GiByte/s}; both rates
+ *                     are scaled by 1024^3
+ */
+vector<double> test_A_sum(const size_t &NLOOPS, const size_t &N)
+{
+    cout << "#################### (A) sum ####################" << endl;
+    cout << "\nLOOPS = " << NLOOPS << endl;
+    cout << "\nN = " << N << endl;
+
+//  Memory allocation
+    cout << "Memory allocation\n";
+
+//  Data initialization happens in the constructor: x_i = 1  ==> sum(x) == N
+    vector<double> x(N, 1.0);
+
+    cout.precision(2);
+    cout << 1.0*N *sizeof(x[0]) / 1024 / 1024 / 1024 << " GByte Memory allocated\n";
+    cout.precision(6);
+
+    cout << "\nStart Benchmarking sum\n";
+
+    // steady_clock is monotonic; system_clock may jump when the wall clock is adjusted
+    auto const t1 = steady_clock::now(); // start timer
+// Do calculation
+    double check(0.0), ss(0.0);
+    for (size_t i = 0; i < NLOOPS; ++i)
+    {
+        check = sum(x);
+        ss += check;                   // keeps a data dependency on every result
+    }
+    auto const t2 = steady_clock::now();  // stop timer
+
+    auto const duration = duration_cast<microseconds>(t2 - t1);  // duration in microseconds
+    double t_diff = static_cast<double>(duration.count()) / 1e6; // overall duration in seconds
+    t_diff = t_diff/NLOOPS;                                      // duration per loop seconds
+
+// Check the correct result
+    cout << "\n sum(x) = " << check << endl;   // label fixed: this is a sum, not <x,y>
+    // Tolerance instead of an unsigned-int truncation, which wraps for large N.
+    if (fabs(check - static_cast<double>(N)) > 0.5)
+        cout << "  !!   W R O N G  result   !!\n";
+    cout << endl;
+
+// Timings  and Performance
+    cout << endl;
+    cout.precision(2);
+
+    // 1 flop (add) and 1 loaded double per vector entry
+    double const Gflops = 1.0*N / t_diff / 1024 / 1024 / 1024;
+    double const MemBandwidth = 1.0*N / t_diff / 1024 / 1024 / 1024 * sizeof(x[0]);
+
+    cout << "Total duration : " << t_diff*NLOOPS << endl;
+    cout << "Timing in sec. : " << t_diff << endl;
+    cout << "GFLOPS         : " << Gflops << endl;
+    cout << "GiByte/s       : " << MemBandwidth << endl;
+
+    return vector<double>{t_diff, Gflops, MemBandwidth};
+}
+
+
+/** Benchmark (B): times the dense matrix-vector product MatVec_parallel(A, x).
+ *
+ *  A is an MxN matrix stored row-wise; x holds the reciprocals of one
+ *  reference row of A, so the matching entry of A*x equals N up to rounding.
+ *
+ *  @param[in] NLOOPS  number of timed repetitions
+ *  @param[in] N       number of columns of A (length of x)
+ *  @param[in] M       number of rows of A
+ *  @return            {seconds per repetition, GFLOPS, GiByte/s}; all zeros
+ *                     when M or N is 0
+ */
+vector<double> test_B(const size_t &NLOOPS, const size_t &N, const size_t &M)
+{
+    cout << "#################### (B) ####################" << endl;
+
+    cout << "\nLOOPS = " << NLOOPS << endl;
+    cout << "\nN = " << N << endl;
+    cout << "\nM = " << M << endl;
+
+    // Empty dimensions would index into empty vectors below.
+    if (M == 0 || N == 0)
+    {
+        cout << "  !!   empty problem, nothing to benchmark   !!\n";
+        return vector<double>{0.0, 0.0, 0.0};
+    }
+    // Reference row for the result check: row 17 as before, clamped so that
+    // matrices with fewer than 18 rows are no longer read out of bounds.
+    size_t const row = (M > 17) ? 17 : M - 1;
+
+//  Memory allocation
+    cout << "Memory allocation\n";
+
+    vector<double> A(M*N);
+    vector<double> x(N);
+
+    cout.precision(2);
+    cout << (1.0*M*N + N) * sizeof(x[0]) / 1024 / 1024 / 1024 << " GByte Memory allocated\n";
+    cout.precision(6);
+
+//  Data initialization
+    for (size_t i = 0; i < M; ++i)
+        for (size_t j = 0; j < N; ++j)
+            A[N*i + j] = (i + j) % 219 + 1;
+
+    for (size_t j = 0; j < N; ++j)
+    {
+        x[j] = 1.0/A[N*row + j];
+    }
+
+    cout << "\nStart Benchmarking MatVec\n";
+
+    // steady_clock is monotonic; system_clock may jump when the wall clock is adjusted
+    auto const t1 = steady_clock::now(); // start timer
+// Do calculation
+    vector<double> b(M);
+
+    for (size_t i = 0; i < NLOOPS; ++i)
+    {
+        b = MatVec_parallel(A, x);
+    }
+    auto const t2 = steady_clock::now();  // stop timer
+
+    auto const duration = duration_cast<microseconds>(t2 - t1);  // duration in microseconds
+    double t_diff = static_cast<double>(duration.count()) / 1e6; // overall duration in seconds
+    t_diff = t_diff/NLOOPS;                                      // duration per loop seconds
+
+// Check the correct result
+    cout << "\n <A[" << row << ",*],x> = " << b[row] << endl;
+    // Tolerance instead of truncation: N - epsilon must not count as wrong.
+    if (fabs(b[row] - static_cast<double>(N)) > 0.5)
+    {
+        cout << "  !!   W R O N G  result   !!\n";
+    }
+    cout << endl;
+
+// Timings  and Performance
+    cout << endl;
+    cout.precision(2);
+
+    double const Gflops = (2.0*N*M) / t_diff / 1024 / 1024 / 1024;
+    double const MemBandwidth = (2.0*N*M + M)/ t_diff / 1024 / 1024 / 1024 * sizeof(x[0]);
+
+    cout << "Total duration : " << t_diff*NLOOPS << endl;
+    cout << "Timing in sec. : " << t_diff << endl;
+    cout << "GFLOPS         : " << Gflops << endl;
+    cout << "GiByte/s       : " << MemBandwidth << endl;
+
+    return vector<double>{t_diff, Gflops, MemBandwidth};
+}
+
+
+/** Benchmark (C): times the dense matrix-matrix product MatMat_parallel(A, B, L).
+ *
+ *  A is MxL and B is LxN, both stored row-wise. Every column of B holds the
+ *  reciprocals of one reference row of A, so the matching row of C = A*B
+ *  equals L in every entry, up to rounding.
+ *
+ *  @param[in] NLOOPS  number of timed repetitions
+ *  @param[in] L       shared dimension (columns of A, rows of B)
+ *  @param[in] M       number of rows of A
+ *  @param[in] N       number of columns of B
+ *  @return            {seconds per repetition, GFLOPS, GiByte/s}; all zeros
+ *                     when any dimension is 0
+ */
+vector<double> test_C(const size_t &NLOOPS, const size_t &L, const size_t &M, const size_t &N)
+{
+    cout << "#################### (C) ####################" << endl;
+    cout << "\nLOOPS = " << NLOOPS << endl;
+    cout << "\nL = " << L << endl;
+    cout << "\nM = " << M << endl;
+    cout << "\nN = " << N << endl;
+
+    // Empty dimensions would index into empty vectors below.
+    if (L == 0 || M == 0 || N == 0)
+    {
+        cout << "  !!   empty problem, nothing to benchmark   !!\n";
+        return vector<double>{0.0, 0.0, 0.0};
+    }
+    // Reference row for the result check: row 17 as before, clamped so that
+    // matrices with fewer than 18 rows are no longer read out of bounds.
+    size_t const row = (M > 17) ? 17 : M - 1;
+
+//  Memory allocation
+    cout << "Memory allocation\n";
+
+    vector<double> A(M*L);
+    vector<double> B(L*N);
+
+    cout.precision(2);
+    cout << (1.0*M*L + L*N) *sizeof(A[0]) / 1024 / 1024 / 1024 << " GByte Memory allocated\n";
+    cout.precision(6);
+
+//  Data initialization
+    for (size_t i = 0; i < M; ++i)
+        for (size_t k = 0; k < L; ++k)
+            A[L*i + k] = (i + k) % 219 + 1;
+
+    for (size_t k = 0; k < L; ++k)
+        for (size_t j = 0; j < N; ++j)
+            B[N*k + j] = 1.0/A[L*row + k];
+
+    cout << "\nStart Benchmarking MatMat\n";
+
+    // steady_clock is monotonic; system_clock may jump when the wall clock is adjusted
+    auto const t1 = steady_clock::now(); // start timer
+// Do calculation
+    vector<double> C(M*N);
+    double check = 0.0;       // initialised so that NLOOPS == 0 does not read garbage
+    double check_sum = 0.0;
+
+    for (size_t i = 0; i < NLOOPS; ++i)
+    {
+        C = MatMat_parallel(A, B, L);
+
+        check = C[N*row];
+        check_sum += check; // prevents the optimizer from removing unused calculation results.
+    }
+    auto const t2 = steady_clock::now();  // stop timer
+
+    // Printed after the timer stops so that console I/O is not part of the measurement.
+    cout << check_sum;
+
+    auto const duration = duration_cast<microseconds>(t2 - t1);  // duration in microseconds
+    double t_diff = static_cast<double>(duration.count()) / 1e6; // overall duration in seconds
+    t_diff = t_diff/NLOOPS;                                      // duration per loop seconds
+
+// Check the correct result
+    cout << "\n C[" << row << ",0] = " << check << endl;
+    // Tolerance instead of truncation: L - epsilon must not count as wrong.
+    if (fabs(check - static_cast<double>(L)) > 0.5)
+    {
+        cout << "  !!   W R O N G  result   !!, should be " << L <<"\n";
+    }
+    cout << endl;
+
+// Timings  and Performance
+    cout << endl;
+    cout.precision(2);
+
+    double const Gflops = (2.0*L*N*M) / t_diff / 1024 / 1024 / 1024;
+    double const MemBandwidth = (2.0*L*N*M + M*N)/ t_diff / 1024 / 1024 / 1024 * sizeof(A[0]);
+
+    cout << "Total duration : " << t_diff*NLOOPS << endl;
+    cout << "Timing in sec. : " << t_diff << endl;
+    cout << "GFLOPS         : " << Gflops << endl;
+    cout << "GiByte/s       : " << MemBandwidth << endl;
+
+    return vector<double>{t_diff, Gflops, MemBandwidth};
+}
+
+
+/** Benchmark (D): times the polynomial evaluation poly_parallel(a, x).
+ *
+ *  The polynomial is 1 - x + x^2 - x^3 + ... of degree p, evaluated at the
+ *  points x_j = j. The check uses x_0 = 0, where the exact value is a[0] = 1.
+ *
+ *  @param[in] NLOOPS  number of timed repetitions
+ *  @param[in] N       number of evaluation points
+ *  @param[in] p       polynomial degree (p + 1 coefficients)
+ *  @return            {seconds per repetition, GFLOPS, GiByte/s}; all zeros
+ *                     when N is 0
+ */
+vector<double> test_D(const size_t &NLOOPS, const size_t &N, const size_t &p)
+{
+    cout << "#################### (D) ####################" << endl;
+    cout << "\nLOOPS = " << NLOOPS << endl;
+    cout << "\nN = " << N << endl;
+    cout << "\np = " << p << endl;
+
+    // The result check reads x[0] and y[0], which do not exist for N == 0.
+    if (N == 0)
+    {
+        cout << "  !!   empty problem, nothing to benchmark   !!\n";
+        return vector<double>{0.0, 0.0, 0.0};
+    }
+
+//  Memory allocation
+    cout << "Memory allocation\n";
+
+    vector<double> a(p + 1, 0);
+    vector<double> x(N);
+
+    cout.precision(2);
+    cout << (1.0*(p + 1) + N) *sizeof(x[0]) / 1024 / 1024 / 1024 << " GByte Memory allocated\n";
+    cout.precision(6);
+
+//  Data initialization
+    for (size_t j = 0; j < N; ++j)
+        x[j] = 1.0*j;
+
+    for (size_t k = 0; k < p + 1; ++k)
+        a[k] = (k % 2 == 0) ? 1.0 : -1.0;   // poly(x) = 1 - x + x^2 - x^3 + x^4 - ...
+
+    cout << "\nStart Benchmarking poly\n";
+
+    // steady_clock is monotonic; system_clock may jump when the wall clock is adjusted
+    auto const t1 = steady_clock::now(); // start timer
+// Do calculation
+    vector<double> y(N);
+    double check = 0.0;       // initialised so that NLOOPS == 0 does not read garbage
+    double check_sum = 0.0;   // was uninitialised before being accumulated into
+
+    for (size_t i = 0; i < NLOOPS; ++i)
+    {
+        y = poly_parallel(a, x);
+        check = y[0];
+
+        check_sum += check; // prevents the optimizer from removing unused calculation results.
+    }
+    auto const t2 = steady_clock::now();  // stop timer
+
+    auto const duration = duration_cast<microseconds>(t2 - t1);  // duration in microseconds
+    double t_diff = static_cast<double>(duration.count()) / 1e6; // overall duration in seconds
+    t_diff = t_diff/NLOOPS;                                      // duration per loop seconds
+
+// Check the correct result
+    cout << "\n poly(" << x[0] << ") = " << check << endl;
+    // fabs keeps the comparison in floating point; an unqualified abs may
+    // resolve to the integer overload and truncate the difference.
+    if (fabs(check - 1.0) > 1.0/1e6)
+    {
+        cout << "  !!   W R O N G  result   !!\n";
+    }
+    cout << endl;
+
+// Timings  and Performance
+    cout << endl;
+    cout.precision(2);
+
+    double const Gflops = (N*(p + 1)*3.0) / t_diff / 1024 / 1024 / 1024;
+    double const MemBandwidth = (N*(2.0 + 3.0*(p + 1)))/ t_diff / 1024 / 1024 / 1024 * sizeof(x[0]);
+
+    cout << "Total duration : " << t_diff*NLOOPS << endl;
+    cout << "Timing in sec. : " << t_diff << endl;
+    cout << "GFLOPS         : " << Gflops << endl;
+    cout << "GiByte/s       : " << MemBandwidth << endl;
+
+    return vector<double>{t_diff, Gflops, MemBandwidth};
+}
--- a/ex5/ex5_4/benchmark_tests.h
+++ b/ex5/ex5_4/benchmark_tests.h
@ -0,0 +1,13 @@
+#pragma once
+#include <vector>
+using namespace std;
+
+/** Benchmark (A): times the inner product of two vectors of length N.
+ *  @param[in] NLOOPS  number of timed repetitions
+ *  @param[in] N       vector length
+ *  @return            {seconds per repetition, GFLOPS, GiByte/s}
+ */
+vector<double> test_A(const size_t &NLOOPS, const size_t &N);
+
+/** Benchmark (A) sum: times the sum over the entries of a vector of length N.
+ *  @param[in] NLOOPS  number of timed repetitions
+ *  @param[in] N       vector length
+ *  @return            {seconds per repetition, GFLOPS, GiByte/s}
+ */
+vector<double> test_A_sum(const size_t &NLOOPS, const size_t &N);
+
+/** Benchmark (B): times the product of a dense MxN matrix with a vector.
+ *  @param[in] NLOOPS  number of timed repetitions
+ *  @param[in] N       number of matrix columns (vector length)
+ *  @param[in] M       number of matrix rows
+ *  @return            {seconds per repetition, GFLOPS, GiByte/s}
+ */
+vector<double> test_B(const size_t &NLOOPS, const size_t &N, const size_t &M);
+
+/** Benchmark (C): times the product of a dense MxL matrix with a dense LxN matrix.
+ *  @param[in] NLOOPS  number of timed repetitions
+ *  @param[in] L       shared dimension
+ *  @param[in] M       number of rows of the first matrix
+ *  @param[in] N       number of columns of the second matrix
+ *  @return            {seconds per repetition, GFLOPS, GiByte/s}
+ */
+vector<double> test_C(const size_t &NLOOPS, const size_t &L, const size_t &M, const size_t &N);
+
+/** Benchmark (D): times the evaluation of a degree-p polynomial at N points.
+ *  @param[in] NLOOPS  number of timed repetitions
+ *  @param[in] N       number of evaluation points
+ *  @param[in] p       polynomial degree (p + 1 coefficients)
+ *  @return            {seconds per repetition, GFLOPS, GiByte/s}
+ */
+vector<double> test_D(const size_t &NLOOPS, const size_t &N, const size_t &p);
--- a/ex5/ex5_4/benchmarks.cpp
+++ b/ex5/ex5_4/benchmarks.cpp
@ -0,0 +1,141 @@
+#include "benchmarks.h"
+#include <cassert>       // assert()
+#include <cmath>
+#include <iostream>
+#include <vector>
+#include <omp.h>
+
+// (A) Inner product <x,y> of two equally long vectors (from skalar_stl).
+// The loop is distributed over the OpenMP threads; the per-thread partial
+// results are combined by the reduction clause.
+double scalar_parallel(vector<double> const &x, vector<double> const &y)
+{
+    assert(x.size() == y.size());
+    size_t const len = x.size();
+    double dot = 0.0;
+#pragma omp parallel for shared(x, y, len) reduction(+:dot)
+    for (size_t idx = 0; idx < len; ++idx)
+    {
+        double const product = x[idx] * y[idx];
+        dot += product;
+    }
+    return dot;
+}
+
+// (A) Sum of all entries of x.
+// The loop is distributed over the OpenMP threads; the per-thread partial
+// sums are combined by the reduction clause.
+double sum(vector<double> const &x)
+{
+    double total = 0.0;
+#pragma omp parallel for shared(x) reduction(+:total)
+    for (size_t idx = 0; idx < x.size(); ++idx)
+    {
+        total = total + x[idx];
+    }
+    return total;
+}
+
+
+// (B) Matrix-vector product b = A*x (from intro_vector_densematrix).
+// A is stored row-wise in a 1D array with x.size() columns; the number of
+// rows is derived from A.size(). The rows are distributed over the OpenMP
+// threads. An empty x yields an empty result, because the row count cannot
+// be derived and the size check would otherwise compute a modulo by zero.
+vector<double> MatVec_parallel(vector<double> const &A, vector<double> const &x)
+{
+    size_t const nelem = A.size();
+    size_t const N = x.size();
+    if (N == 0)
+    {
+        return vector<double>();
+    }
+    assert(nelem % N == 0); // make sure multiplication is possible
+    size_t const M = nelem/N;
+
+    vector<double> b(M);
+
+#pragma omp parallel for shared(A, x, N, M, b)
+    for (size_t i = 0; i < M; ++i)
+    {
+        double tmp = 0.0;   // row-local accumulator, private to the thread
+        for (size_t j = 0; j < N; ++j)
+            tmp += A[N*i + j] * x[j];
+        b[i] = tmp;
+    }
+
+    return b;
+}
+
+
+// (C) Matrix-matrix product C = A*B.
+// A is MxL and B is LxN, both stored row-wise in 1D arrays; M and N are
+// derived from the array sizes and the shared dimension L. The rows of C are
+// distributed over the OpenMP threads. L == 0 yields an empty result, because
+// M and N cannot be derived and the size check would compute a modulo by zero.
+vector<double> MatMat_parallel(vector<double> const &A, vector<double> const &B, size_t const &L)
+{
+    if (L == 0)
+    {
+        return vector<double>();
+    }
+
+    size_t const nelem_A = A.size();
+    size_t const nelem_B = B.size();
+
+    assert(nelem_A % L == 0 && nelem_B % L == 0);
+
+    size_t const M = nelem_A/L;
+    size_t const N = nelem_B/L;
+
+    vector<double> C(M*N);   // value-initialised to 0, so += below accumulates from zero
+
+#pragma omp parallel for shared(A, B, M, N, L, C)
+    for (size_t i = 0; i < M; ++i)
+    {
+        // i-k-j loop order: the innermost loop walks contiguously through a
+        // row of B and a row of C.
+        for (size_t k = 0; k < L; ++k)
+        {
+            for (size_t j = 0; j < N; ++j)
+            {
+                C[N*i + j] += A[L*i + k]*B[N*k + j];
+            }
+        }
+    }
+
+    return C;
+}
+
+
+// (D) Evaluation of a polynomial function using Horner's scheme.
+// a[k] is the coefficient of x^k; for every entry of x the result is
+// y = a[0] + a[1]*x + ... + a[p]*x^p with p = a.size() - 1. The evaluation
+// points are distributed over the OpenMP threads. An empty coefficient
+// vector yields the zero polynomial.
+vector<double> poly_parallel(vector<double> const &a, vector<double> const &x)
+{
+    size_t const N = x.size();
+    size_t const ncoeff = a.size();
+    vector<double> y(N, 0);
+
+#pragma omp parallel for shared(a, x, N, ncoeff, y)
+    for (size_t i = 0; i < N; ++i)
+    {
+        double const x_temp = x[i];
+        double y_temp = 0;
+        // Horner step, highest coefficient first: y <- y*x + a[k].
+        // The step must assign; accumulating with += evaluates a different
+        // function, y <- (1 + x)*y + a[k], which is only correct for x == 0.
+        for (size_t k = ncoeff; k-- > 0; )
+        {
+            y_temp = x_temp*y_temp + a[k];
+        }
+        y[i] = y_temp;
+    }
+
+    return y;
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
--- a/ex5/ex5_4/benchmarks.h
+++ b/ex5/ex5_4/benchmarks.h
@ -0,0 +1,55 @@
+#pragma once
+#include <vector>
+using namespace std;
+
+/**     (A) Inner product of two vectors (from skalar_stl)
+	@param[in] x	vector
+	@param[in] y	vector of the same length as x
+	@return 	    resulting Euclidean inner product <x,y>
+*/
+double scalar_parallel(vector<double> const &x, vector<double> const &y);
+
+
+/**     (A) Sum entries of vector
+	@param[in] x	vector
+	@return 	    sum of all entries of x
+*/
+double sum(vector<double> const &x);
+
+
+/** 	(B) Matrix-vector product (from intro_vector_densematrix)
+ * 	@param[in] A	dense matrix (1D access), stored row-wise with x.size() columns
+ *  @param[in] x	vector
+ *
+ *	@return    resulting vector A*x
+*/
+vector<double> MatVec_parallel(vector<double> const &A, vector<double> const &x);
+
+
+/** 	(C) Matrix-matrix product
+ * 	@param[in] A			MxL dense matrix (1D access)
+ *  @param[in] B			LxN dense matrix (1D access)
+ *  @param[in] shared_dim 	shared dimension L
+ *
+ *	@return    resulting MxN matrix (1D access)
+*/
+vector<double> MatMat_parallel(vector<double> const &A, vector<double> const &B, size_t const &shared_dim);
+
+
+/** 	(D) Evaluation of a polynomial function using Horner's scheme
+ * 	@param[in] a	coefficient vector; a[k] is the coefficient of x^k
+ *  @param[in] x	vector with input values
+ *
+ *	@return    vector with output values, one per entry of x
+*/
+vector<double> poly_parallel(vector<double> const &a, vector<double> const &x);
+
+
+
+
+
+
+
+
+
+
--- a/ex5/ex5_4/main.cpp
+++ b/ex5/ex5_4/main.cpp
@ -0,0 +1,84 @@
+#include "benchmark_tests.h"
+#include <iostream>
+#include <cmath>
+
+/** Runs the scalar-product and sum benchmarks for the vector lengths
+ *  10^3 ... 10^8, then one run each of benchmarks (B), (C) and (D), and
+ *  finally prints the collected results of the first two series as tables.
+ */
+int main()
+{
+    // Repetition counts for N = 10^3, 10^4, ..., 10^8. The vector lengths are
+    // built with integer arithmetic; pow(10,k) returns a double whose implicit
+    // truncation to size_t is not guaranteed to give the exact power of ten.
+    size_t const loops_scalar[] = {2000000, 1000000, 100000, 10000, 750, 125};
+    size_t const loops_sum[]    = {3000000, 2000000, 1000000, 50000, 2000, 250};
+    size_t const N_first = 1000;
+
+    vector<vector<double>> results_scalar;
+    {
+        size_t N = N_first;
+        for (size_t const loops : loops_scalar)
+        {
+            results_scalar.push_back(test_A(loops, N));
+            N *= 10;
+        }
+    }
+
+    vector<vector<double>> results_sum;
+    {
+        size_t N = N_first;
+        for (size_t const loops : loops_sum)
+        {
+            results_sum.push_back(test_A_sum(loops, N));
+            N *= 10;
+        }
+    }
+
+    // The remaining benchmarks print their own report; the returned values are not collected.
+    test_B(100, 20000, 10000);
+
+    test_C(25, 500, 1000, 1500);
+
+    test_D(100, 100, 1000000);
+
+
+    cout << endl << "###### Scalar ######" << endl;
+    cout << "Timing\tGFLOPS\tGiByte/s" << endl;
+    cout << "------------------------------" << endl;
+    for (size_t i = 0; i < results_scalar.size(); ++i)
+        cout << results_scalar[i][0] << "\t" << results_scalar[i][1] << "\t" << results_scalar[i][2] << endl;
+
+    cout << endl << "###### Sum ######" << endl;
+    cout << "Timing\tGFLOPS\tGiByte/s" << endl;
+    cout << "------------------------------" << endl;
+    for (size_t i = 0; i < results_sum.size(); ++i)
+        cout << results_sum[i][0] << "\t" << results_sum[i][1] << "\t" << results_sum[i][2] << endl;
+
+
+    // Recorded results of an earlier run:
+    //
+    // ###### Scalar ######
+    // Timing  GFLOPS  GiByte/s
+    // ------------------------------
+    // 3.4e-06 0.54    4.3
+    // 4.6e-06 4       32
+    // 1.6e-05 12      95
+    // 0.0011  1.7     13
+    // 0.0097  1.9     15
+    // 0.075   2.5     20
+
+
+    // ###### Sum ######
+    // Timing  GFLOPS  GiByte/s
+    // ------------------------------
+    // 5.5e-06 0.17    1.3
+    // 5.4e-06 1.7     14
+    // 1.5e-05 6.1     49
+    // 0.00013 7.2     57
+    // 0.0033  2.8     23
+    // 0.032   2.9     23
+
+
+    // ######### NOT PARALLEL (from exercise sheet 2) #########
+    //      Timing  GFLOPS  GiByte/s
+    // ----------------------------------
+    // (A)  0.038   2.5     20
+    // (B)  0.13    2.9     23
+    // (C)  0.44    3.2     25
+    // (D)  0.19    1.5     12
+
+
+    return 0;
+}