sheet 5 not tested

2025-12-02 20:28:11 +01:00 · 2025-12-02 20:28:11 +01:00 · 64c7aed176
commit 64c7aed176
parent 9c16d18f8f
169 changed files with 225337 additions and 0 deletions
--- a/sheet5/4/.vscode/settings.json
+++ b/sheet5/4/.vscode/settings.json
@ -0,0 +1,6 @@
+{
+    "files.associations": {
+        "ostream": "cpp",
+        "iostream": "cpp"
+    }
+}
--- a/sheet5/4/.vscode/settings.json:Zone.Identifier
+++ b/sheet5/4/.vscode/settings.json:Zone.Identifier
--- a/sheet5/4/Doxyfile
+++ b/sheet5/4/Doxyfile
--- a/sheet5/4/Doxyfile:Zone.Identifier
+++ b/sheet5/4/Doxyfile:Zone.Identifier
--- a/sheet5/4/Makefile
+++ b/sheet5/4/Makefile
@ -0,0 +1,30 @@
+#
+# Compiler selection.
+# GNU compiler tools (GCC_) are the default. Because the assignment below is
+# conditional (?=), the default can be overridden
+#   from the environment:   export COMPILER=ICC_
+#   or on the command line: make COMPILER=ICC_
+# (A plain '=' assignment here would silently override the environment.)
+COMPILER ?= GCC_
+
+# use Intel compilers
+#COMPILER=ICC_
+
+# use PGI compilers
+# COMPILER=PGI_
+
+
+SOURCES = main.cpp mylib.cpp benchmark.cpp
+OBJECTS = $(SOURCES:.cpp=.o)
+
+PROGRAM	= main.${COMPILER}
+
+# the next two lines enable debugging and detailed performance analysis;
+# comment them out to build without debug information
+CXXFLAGS += -g
+LINKFLAGS += -g
+# do not use -pg with PGI compilers
+
+# compiler specific rules and flags live one directory up
+include ../${COMPILER}default.mk
--- a/sheet5/4/Makefile:Zone.Identifier
+++ b/sheet5/4/Makefile:Zone.Identifier
--- a/sheet5/4/bench
+++ b/sheet5/4/bench
--- a/sheet5/4/benchmark.cpp
+++ b/sheet5/4/benchmark.cpp
@ -0,0 +1,115 @@
+#include <cassert>
+#include <cmath>
+#include <cstddef>
+#include <iostream>
+#include <vector>
+using namespace std;
+
+
+/** Inner product of two equally long vectors, parallelised with OpenMP.
+ *
+ *  @param[in] x  first vector
+ *  @param[in] y  second vector, must have the same length as @p x
+ *  @return       sum over all i of x[i]*y[i]; 0.0 for empty vectors
+ */
+double benchmark_A(const vector<double> &x, const vector<double> &y)
+{
+    assert(x.size() == y.size()); // switch off via compile flag: -DNDEBUG
+    // Index with size_t: a narrower unsigned int could wrap around before
+    // reaching x.size() for very long vectors.
+    const size_t n = x.size();
+    double sum = 0.0;
+    #pragma omp parallel for reduction(+:sum)
+    for (size_t i = 0; i < n; ++i)
+    {
+        sum += x[i] * y[i];
+    }
+    return sum;
+}
+
+/** Sum of all entries of a vector, parallelised with OpenMP.
+ *
+ *  @param[in] x  vector
+ *  @return       sum over all i of x[i]; 0.0 for an empty vector
+ */
+double benchmark_A_sum(const vector<double> &x)
+{
+    // Index with size_t: a narrower unsigned int could wrap around before
+    // reaching x.size() for very long vectors.
+    const size_t n = x.size();
+    double sum = 0.0;
+    #pragma omp parallel for reduction(+:sum)
+    for (size_t i = 0; i < n; ++i)
+    {
+        sum += x[i];
+    }
+    return sum;
+}
+
+/** Matrix-vector product b = A*x, parallelised over the rows with OpenMP.
+ *
+ *  @param[in] A  dense matrix stored row by row, entry (i,j) at A[i*N+j]
+ *                with N = x.size(); the number of rows is A.size()/N
+ *  @param[in] x  vector with N entries
+ *  @return       vector b with A.size()/N entries; empty if @p x is empty
+ */
+vector<double> benchmark_B(const vector<double> &A, const vector<double> &x)
+{
+    const size_t N = x.size();
+    if (N == 0)
+    {
+        // No columns: the row count A.size()/N would be a division by zero.
+        return vector<double>();
+    }
+    const size_t M = A.size() / N;
+    vector<double> b(M, 0.0);
+
+    // size_t indices keep i*N+j from overflowing for large matrices.
+    #pragma omp parallel for
+    for (size_t i = 0; i < M; ++i)
+    {
+        const size_t rowStart = i * N;
+        double bi = 0.0;
+        for (size_t j = 0; j < N; ++j)
+        {
+            bi += A[rowStart + j] * x[j];
+        }
+        b[i] = bi;
+    }
+
+    return b;
+}
+
+
+/** Matrix-matrix product C = A*B, parallelised over (i,j) with OpenMP.
+ *
+ *  All matrices are dense and stored row by row.
+ *
+ *  @param[in] A  M x L matrix, entry (i,k) at A[i*L+k] with L = A.size()/M
+ *  @param[in] B  L x N matrix, entry (k,j) at B[k*N+j] with N = B.size()/L
+ *  @param[in] M  number of rows of @p A
+ *  @return       M x N matrix C, entry (i,j) at C[i*N+j]; empty if M is 0
+ *                or @p A holds fewer than M entries
+ */
+vector<double> benchmark_C(const vector<double> &A, const vector<double> &B, unsigned int M)
+{
+    // Work in size_t so that M*N and the index arithmetic cannot overflow
+    // the range of unsigned int for large matrices.
+    const size_t rows = M;
+    if (rows == 0)
+    {
+        return vector<double>();      // avoids the division A.size()/0
+    }
+    const size_t L = A.size() / rows;
+    if (L == 0)
+    {
+        return vector<double>();      // avoids the division B.size()/0
+    }
+    const size_t N = B.size() / L;
+    vector<double> C(rows * N, 0.0);
+
+    #pragma omp parallel for collapse(2)
+    for (size_t i = 0; i < rows; ++i)
+    {
+        for (size_t j = 0; j < N; ++j)
+        {
+            double sum = 0.0;
+            for (size_t k = 0; k < L; ++k)
+            {
+                sum += A[i * L + k] * B[k * N + j];
+            }
+            C[i * N + j] = sum;
+        }
+    }
+
+    return C;
+}
+
+/** Evaluates a polynomial at many points with Horner's scheme (OpenMP parallel).
+ *
+ *  The polynomial is coeff[0] + coeff[1]*t + ... + coeff[p-1]*t^(p-1)
+ *  with p = coeff.size().
+ *
+ *  @param[in] coeff  p coefficients, lowest order first (degree p-1)
+ *  @param[in] x      evaluation points
+ *  @return           vector y with y[i] = polynomial value at x[i];
+ *                    all zeros if @p coeff is empty
+ */
+vector<double> benchmark_D(const vector<double>& coeff, const vector<double>& x)
+{
+    const size_t p = coeff.size();   // p coefficients, degree p-1
+    const size_t N = x.size();
+    vector<double> y(N, 0.0);
+    if (p == 0)
+    {
+        // Empty coefficient list: coeff[p-1] would be an out-of-bounds read.
+        return y;
+    }
+
+#pragma omp parallel for
+    for (size_t i = 0; i < N; ++i)
+    {
+        const double xi = x[i];
+        double yi = coeff[p - 1];
+        // Counts j = p-2, ..., 0 without ever forming a negative index.
+        for (size_t j = p - 1; j-- > 0; )
+        {
+            yi = yi * xi + coeff[j];
+        }
+        y[i] = yi;
+    }
+    return y;
+}
+
+
+/** Sequential inner product, the reference for the OpenMP version benchmark_A.
+ *
+ *  @param[in] x  first vector
+ *  @param[in] y  second vector, must have the same length as @p x
+ *  @return       sum over all i of x[i]*y[i]; 0.0 for empty vectors
+ */
+double benchmark_A_old(const vector<double> &x, const vector<double> &y)
+{
+    assert(x.size() == y.size()); // switch off via compile flag: -DNDEBUG
+    // Index with size_t: a narrower unsigned int could wrap around before
+    // reaching x.size() for very long vectors.
+    const size_t n = x.size();
+    double sum = 0.0;
+    for (size_t i = 0; i < n; ++i)
+    {
+        sum += x[i] * y[i];
+    }
+    return sum;
+}
+/** Sequential sum of all vector entries, the reference for benchmark_A_sum.
+ *
+ *  @param[in] x  vector
+ *  @return       sum over all i of x[i], added front to back; 0.0 if empty
+ */
+double benchmark_A_sum_old(const vector<double> &x)
+{
+    double sum = 0.0;
+    // Range-for visits every element regardless of the vector length; the
+    // former unsigned int index could wrap around for very long vectors.
+    for (const double value : x)
+    {
+        sum += value;
+    }
+    return sum;
+}
--- a/sheet5/4/benchmark.cpp:Zone.Identifier
+++ b/sheet5/4/benchmark.cpp:Zone.Identifier
--- a/sheet5/4/benchmark.h
+++ b/sheet5/4/benchmark.h
@ -0,0 +1,27 @@
+#ifndef BENCHMARK_H
+#define BENCHMARK_H
+
+#include <vector>
+
+// Names are qualified with std:: so that including this header does not
+// inject 'using namespace std;' into every translation unit.
+
+/** 	Inner product, OpenMP parallel
+	@param[in] x	vector
+	@param[in] y	vector
+	@return 	    sum over all i of x[i]*y[i]
+*/
+double benchmark_A(const std::vector<double> &x,
+                   const std::vector<double> &y);
+
+/** 	Sum of all vector entries, OpenMP parallel
+	@param[in] x	vector
+	@return 	    sum over all i of x[i]
+*/
+double benchmark_A_sum(const std::vector<double> &x);
+
+/** 	Matrix-vector product b = A*x, OpenMP parallel
+	@param[in] A	dense matrix stored row by row, entry (i,j) at A[i*N+j] with N = x.size()
+	@param[in] x	vector with N entries
+	@return 	    vector b with A.size()/N entries
+*/
+std::vector<double> benchmark_B(const std::vector<double> &A,
+                                const std::vector<double> &x);
+
+/** 	Matrix-matrix product C = A*B, OpenMP parallel
+	@param[in] A	M x L matrix stored row by row, L = A.size()/M
+	@param[in] B	L x N matrix stored row by row, N = B.size()/L
+	@param[in] M	number of rows of A
+	@return 	    M x N matrix C stored row by row
+*/
+std::vector<double> benchmark_C(const std::vector<double> &A,
+                                const std::vector<double> &B,
+                                unsigned int M);
+
+/** 	Polynomial evaluation with Horner's scheme, OpenMP parallel
+	@param[in] coefficients	polynomial coefficients, lowest order first
+	@param[in] x	evaluation points
+	@return 	    vector of polynomial values, one per entry of x
+*/
+std::vector<double> benchmark_D(const std::vector<double> &coefficients,
+                                const std::vector<double> &x);
+
+/** 	Inner product, sequential reference version
+	@param[in] x	vector
+	@param[in] y	vector
+	@return 	    sum over all i of x[i]*y[i]
+*/
+double benchmark_A_old(const std::vector<double> &x,
+                       const std::vector<double> &y);
+
+/** 	Sum of all vector entries, sequential reference version
+	@param[in] x	vector
+	@return 	    sum over all i of x[i]
+*/
+double benchmark_A_sum_old(const std::vector<double> &x);
+
+#endif
--- a/sheet5/4/benchmark.h:Zone.Identifier
+++ b/sheet5/4/benchmark.h:Zone.Identifier
--- a/sheet5/4/benchmark.o
+++ b/sheet5/4/benchmark.o
--- a/sheet5/4/main.GCC_
+++ b/sheet5/4/main.GCC_
--- a/sheet5/4/main.cpp
+++ b/sheet5/4/main.cpp
@ -0,0 +1,234 @@
+#include "mylib.h"
+#include <cassert>
+#include <chrono>           // timing
+#include <cmath>            // sqrt()
+#include <cstdlib>          // atoi()
+#include <cstring>          // strncmp()
+#include <ctime>
+#include <iostream>
+#include <sstream>
+#include "benchmark.h"
+#include "omp.h"
+using namespace std;
+using namespace std::chrono;  // timing
+
+/** Benchmark driver.
+ *
+ *  Runs the four kernels from benchmark.h (A: inner product, B: matrix-vector
+ *  product, C: matrix-matrix product, D: polynomial evaluation), each repeated
+ *  several times, and prints the average time per call together with the
+ *  derived FLOP and memory rates. Afterwards the sequential and the OpenMP
+ *  versions of the sum and of the inner product are timed once each for vector
+ *  lengths 10^3 ... 10^8 and the speedups are printed.
+ *
+ *  Note: the printed "GFLOPS" and "GiByte/s" values are both obtained by
+ *  dividing by 1024^3.
+ *
+ *  The command line arguments are not used.
+ *  @return 0
+ */
+int main(int /*argc*/, char ** /*argv*/)
+{
+    const unsigned int NA = 1400000;
+    const unsigned int NLOOPSA = 2000;
+    //const unsigned int NLOOPS = 10;
+
+    const unsigned int MC = 1000;
+    const unsigned int NLOOPSC = 5;      // unsigned like the loop counter it bounds
+
+    // ---------- Benchmark A ----------
+    {
+        vector<double> xA(NA), yA(NA);
+        for (unsigned int i = 0; i < NA; ++i)
+        {
+            const double xi = (i % 219) + 1;
+            xA[i] = xi;
+            yA[i] = 1.0 / xi;               // every product x[i]*y[i] is 1
+        }
+
+        // steady_clock is monotonic and therefore suited for measuring intervals
+        const auto tA1 = steady_clock::now();
+        double sA = 0.0, sumA = 0.0;
+        for (unsigned int loop = 0; loop < NLOOPSA; ++loop)
+        {
+            sA = benchmark_A(xA, yA);
+            sumA += sA;
+        }
+        const auto tA2 = steady_clock::now();
+
+        const auto durA = duration_cast<microseconds>(tA2 - tA1);
+        const double tA = static_cast<double>(durA.count()) / 1e6 / NLOOPSA; //duration per loop seconds
+
+        cout << "\n===== Benchmark A =====\n";
+        cout << "<xA,yA> = "  << sA << endl;
+        cout << "Timing in sec. : " << tA << endl;
+        cout << "GFLOPS         : " << 2.0 * NA / tA / 1024 / 1024 / 1024 << endl;
+        cout << "GiByte/s       : "
+             << 2.0 * NA * sizeof(xA[0]) / tA / 1024 / 1024 / 1024 << endl;
+    }
+
+    // ---------- Benchmark B----------
+    {
+        const unsigned int MB = 1700;
+        const unsigned int NB = MB;
+        const unsigned int NLOOPSB = 200;//50;
+
+        vector<double> AB(MB * NB);
+        vector<double> xB(NB);
+
+        for (unsigned int i = 0; i < MB; ++i)
+            for (unsigned int j = 0; j < NB; ++j)
+                AB[i * NB + j] = (i + j) % 219 + 1;
+
+        // x holds the reciprocals of row 17 of the matrix
+        for (unsigned int j = 0; j < NB; ++j)
+        {
+            xB[j] = 1.0 / AB[17 * NB + j];
+        }
+
+        vector<double> bB;
+        const auto tB1 = steady_clock::now();
+        double guardB = 0.0;                // printed below, so the results are used
+        for (unsigned int loop = 0; loop < NLOOPSB; ++loop)
+        {
+            bB = benchmark_B(AB, xB);
+            guardB += bB[17];
+        }
+        const auto tB2 = steady_clock::now();
+
+        const auto durB = duration_cast<microseconds>(tB2 - tB1);
+        const double tB = static_cast<double>(durB.count()) / 1e6 / NLOOPSB;
+
+        const double flopsB = 2.0 * MB * NB;
+        const double bytesB = (MB * NB + NB + MB) * sizeof(double);
+
+        cout << "\n===== Benchmark B =====\n";
+        cout << guardB << endl;
+        cout << "bytes: " << bytesB << endl;
+        cout << "Timing in sec. : " << tB << endl;
+        cout << "GFLOPS         : " << flopsB / tB / 1024 / 1024 / 1024 << endl;
+        cout << "GiByte/s       : " << bytesB / tB / 1024 / 1024 / 1024 << endl;
+    }
+
+    // ---------- Benchmark C ----------
+    {
+        const unsigned int LC = MC;
+        const unsigned int NC = MC;
+
+        vector<double> AC(MC * LC), BC(LC * NC);
+
+        for (unsigned int i = 0; i < MC; ++i)
+            for (unsigned int j = 0; j < LC; ++j)
+                AC[i * LC + j] = (i + j) % 219 + 1;
+
+        for (unsigned int i = 0; i < LC; ++i)
+            for (unsigned int j = 0; j < NC; ++j)
+                BC[i * NC + j] = (i + j) % 219 + 1;
+
+        vector<double> CC;
+        const auto tC1 = steady_clock::now();
+        double guardC = 0.0;                // printed below, so the results are used
+        for (unsigned int loop = 0; loop < NLOOPSC; ++loop)
+        {
+            CC = benchmark_C(AC, BC, MC);
+            guardC += CC[0];
+        }
+        const auto tC2 = steady_clock::now();
+
+        const auto durC = duration_cast<microseconds>(tC2 - tC1);
+        const double tC = static_cast<double>(durC.count()) / 1e6 / NLOOPSC;
+
+        const double flopsC = 2.0 * MC * LC * NC;
+        const double bytesC = (MC * LC + LC * NC + MC * NC) * sizeof(double);
+
+        cout << "\n===== Benchmark C =====\n";
+        cout << guardC << endl;
+        cout << "bytes: " << bytesC << endl;
+        cout << "Timing in sec. : " << tC << endl;
+        cout << "GFLOPS         : " << flopsC / tC / 1024 / 1024 / 1024 << endl;
+        cout << "GiByte/s       : " << bytesC / tC / 1024 / 1024 / 1024 << endl;
+    }
+
+    // ---------- Benchmark D----------
+    {
+        const unsigned int ND = 2000000;
+        const unsigned int p  = 14;         // p coefficients, degree p-1 = 13
+        const unsigned int NLOOPSD = 100;
+        vector<double> coeff(p, 0.0);
+        vector<double> xD(ND);
+
+        for (unsigned int k = 0; k < p; ++k)
+            coeff[k] = k % 219 + 1;
+
+        for (unsigned int i = 0; i < ND; ++i)
+            xD[i] = i % 219 + 1;
+
+        vector<double> yD;
+        const auto tD1 = steady_clock::now();
+        double guardD = 0.0;                // printed below, so the results are used
+        for (unsigned int loop = 0; loop < NLOOPSD; ++loop)
+        {
+            yD = benchmark_D(coeff, xD);
+            guardD += yD[0];
+        }
+        const auto tD2 = steady_clock::now();
+
+        const auto durD = duration_cast<microseconds>(tD2 - tD1);
+        const double tD = static_cast<double>(durD.count()) / 1e6 / NLOOPSD;
+
+        // counted as 2*p operations per evaluation point
+        const double flopsD = 2.0 * ND * p;
+        const double bytesD = (p + 2 * ND) * sizeof(double);
+
+        cout << "\n===== Benchmark D =====\n";
+        cout << guardD << endl;
+        cout << "bytes: " << bytesD << endl;
+        cout << "Timing in sec. : " << tD << endl;
+        cout << "GFLOPS         : " << flopsD / tD / 1024 / 1024 / 1024 << endl;
+        cout << "GiByte/s       : " << bytesD / tD / 1024 / 1024 / 1024 << endl;
+    }
+
+    // ---------- Sequential vs. OpenMP for n = 10^3 ... 10^8 ----------
+
+    // The kernel results are stored through a volatile object so that they
+    // count as used although they are not printed.
+    volatile double sink = 0.0;
+
+    size_t n = 100;
+    for (int k = 3; k <= 8; ++k)
+    {
+        n *= 10;                            // n == 10^k, exact integer arithmetic
+
+        vector<double> x(n), y(n);
+        for (size_t i = 0; i < n; ++i)
+        {
+            const double xi = (i % 219) + 1;
+            x[i] = xi;
+            y[i] = 1.0 / xi;
+        }
+
+        // ---- SUM benchmark (sequential) ----
+        double t0 = omp_get_wtime();
+        const double s1 = benchmark_A_sum_old(x);
+        const double t_sum_seq = omp_get_wtime() - t0;
+
+        // ---- SUM benchmark (parallel) ----
+        t0 = omp_get_wtime();
+        const double s2 = benchmark_A_sum(x);
+        const double t_sum_omp = omp_get_wtime() - t0;
+
+        const double sum_speedup = t_sum_seq / t_sum_omp;
+
+        // ---- INNER PRODUCT benchmark (sequential) ----
+        t0 = omp_get_wtime();
+        const double ip1 = benchmark_A_old(x, y);
+        const double t_inner_seq = omp_get_wtime() - t0;
+
+        // ---- INNER PRODUCT benchmark (parallel) ----
+        t0 = omp_get_wtime();
+        const double ip2 = benchmark_A(x, y);
+        const double t_inner_omp = omp_get_wtime() - t0;
+
+        const double inner_speedup = t_inner_seq / t_inner_omp;
+
+        sink = s1 + s2 + ip1 + ip2;
+
+        // ---- Print results ----
+        cout << k << endl;
+        cout << t_sum_seq << ", " << t_sum_omp << ", " << sum_speedup << endl;
+        cout << t_inner_seq << ", " << t_inner_omp << ", " << inner_speedup << endl;
+        cout << endl;
+    }
+
+    return 0;
+}
+
--- a/sheet5/4/main.cpp:Zone.Identifier
+++ b/sheet5/4/main.cpp:Zone.Identifier
--- a/sheet5/4/main.o
+++ b/sheet5/4/main.o
--- a/sheet5/4/mylib.cpp
+++ b/sheet5/4/mylib.cpp
@ -0,0 +1,65 @@
+#include "mylib.h"
+#include <cassert>       // assert()
+#include <cmath>
+#include <vector>
+
+#ifdef __INTEL_CLANG_COMPILER
+#pragma message(" ##########  Use of MKL  ###############")
+#include <mkl.h>
+#else
+#pragma message(" ##########  Use of CBLAS  ###############")
+//extern "C"
+//{
+#include <cblas.h>               // cBLAS Library
+#include <lapacke.h>             // Lapack
+//}
+#endif
+
+using namespace std;
+
+// Euclidean inner product <x,y>, accumulated front to back in a plain loop.
+// Both vectors have to be of equal length (checked by the assert only).
+double scalar(vector<double> const &x, vector<double> const &y)
+{
+    assert(x.size() == y.size()); // switch off via compile flag: -DNDEBUG
+    double acc = 0.0;
+    auto yIt = y.begin();
+    for (double const xValue : x)
+    {
+        acc += xValue * (*yIt);
+        // alternative, more expensive kernel: acc += exp(xValue)*log(*yIt);
+        ++yIt;
+    }
+    return acc;
+}
+
+
+// Inner product <x,y> in double precision, delegated to cblas_ddot.
+// The lengths are converted to int before they are compared and passed on.
+double scalar_cblas(vector<double> const &x, vector<double> const &y)
+{
+    int const nx = static_cast<int>(x.size());
+    int const ny = static_cast<int>(y.size());
+    assert(nx == ny); // switch off via compile flag: -DNDEBUG
+    double const *const px = x.data();
+    double const *const py = y.data();
+    return cblas_ddot(nx, px, 1, py, 1);   // both vectors with stride 1
+}
+
+// Inner product <x,y> in single precision, delegated to cblas_sdot.
+// The lengths are converted to int before they are compared and passed on.
+float scalar_cblas(vector<float> const &x, vector<float> const &y)
+{
+    int const nx = static_cast<int>(x.size());
+    int const ny = static_cast<int>(y.size());
+    assert(nx == ny); // switch off via compile flag: -DNDEBUG
+    float const *const px = x.data();
+    float const *const py = y.data();
+    return cblas_sdot(nx, px, 1, py, 1);   // both vectors with stride 1
+}
+
+
+// Euclidean (L_2) norm of x: square root of the sum of the squared entries.
+double norm(vector<double> const &x)
+{
+    double squares = 0.0;
+    for (double const value : x)
+    {
+        squares += value * value;
+    }
+    return std::sqrt(squares);
+}
+
--- a/sheet5/4/mylib.cpp:Zone.Identifier
+++ b/sheet5/4/mylib.cpp:Zone.Identifier
--- a/sheet5/4/mylib.h
+++ b/sheet5/4/mylib.h
@ -0,0 +1,30 @@
+#ifndef FILE_MYLIB
+#define FILE_MYLIB
+#include <vector>
+
+/** 	Inner product
+	@param[in] x	vector
+	@param[in] y	vector of the same length as @p x
+	@return 	    resulting Euclidean inner product <x,y>
+*/
+double scalar(std::vector<double> const &x, std::vector<double> const &y);
+
+/** 	Inner product using BLAS routines
+	(double precision overload; the float overload below has the same contract)
+	@param[in] x	vector
+	@param[in] y	vector of the same length as @p x
+	@return 	    resulting Euclidean inner product <x,y>
+*/
+double scalar_cblas(std::vector<double> const &x, std::vector<double> const &y);
+float scalar_cblas(std::vector<float> const &x, std::vector<float> const &y);
+
+
+/** 	L_2 Norm of a vector
+	@param[in] x	vector
+	@return 	    resulting Euclidean norm of x, i.e. sqrt(<x,x>)
+*/
+double norm(std::vector<double> const &x);
+
+
+
+
+#endif
--- a/sheet5/4/mylib.h:Zone.Identifier
+++ b/sheet5/4/mylib.h:Zone.Identifier
--- a/sheet5/4/mylib.o
+++ b/sheet5/4/mylib.o
--- a/sheet5/4/output.txt
+++ b/sheet5/4/output.txt
@ -0,0 +1,51 @@
+g++ -c -g -O0 -funroll-all-loops -std=c++17 -Wall -pedantic -Wextra -Weffc++ -Woverloaded-virtual -Wfloat-equal -Wshadow -Wredundant-decls -Winline -fmax-errors=1  -flto -o main.o main.cpp
+g++ -c -g -O0 -funroll-all-loops -std=c++17 -Wall -pedantic -Wextra -Weffc++ -Woverloaded-virtual -Wfloat-equal -Wshadow -Wredundant-decls -Winline -fmax-errors=1  -flto -o mylib.o mylib.cpp
+g++ -c -g -O0 -funroll-all-loops -std=c++17 -Wall -pedantic -Wextra -Weffc++ -Woverloaded-virtual -Wfloat-equal -Wshadow -Wredundant-decls -Winline -fmax-errors=1  -flto -o benchmark.o benchmark.cpp
+g++  main.o mylib.o benchmark.o  -g -O0  -llapack -lblas -flto -o main.GCC_
+./main.GCC_ 
+
+===== Benchmark A =====
+<xA,yA> = 1.4e+06
+Timing in sec. : 0.00893637
+GFLOPS         : 0.291808
+GiByte/s       : 2.33446
+
+===== Benchmark B =====
+340000
+bytes: 2.31472e+07
+Timing in sec. : 0.0133897
+GFLOPS         : 0.402029
+GiByte/s       : 1.61001
+
+===== Benchmark C =====
+7.37196e+07
+bytes: 2.4e+07
+Timing in sec. : 8.67235
+GFLOPS         : 0.21478
+GiByte/s       : 0.00257736
+
+===== Benchmark D =====
+10500
+bytes: 3.20001e+07
+Timing in sec. : 0.101087
+GFLOPS         : 0.515935
+GiByte/s       : 0.294821
+
+===== Benchmark 5A =====
+NORM = 150114
+Timing in sec. : 0.00703533
+GFLOPS         : 0.370658
+GiByte/s       : 1.48263
+
+===== Benchmark 5B =====
+<xA,yA> = 1.4e+06
+Timing in sec. : 0.0108377
+GFLOPS         : 0.601533
+GiByte/s       : 1.92491
+
+===== Benchmark 5C =====
+7.37196e+07
+bytes: 2.4e+07
+Timing in sec. : 15.2407
+GFLOPS         : 0.122215
+GiByte/s       : 0.00146658
--- a/sheet5/4/output.txt:Zone.Identifier
+++ b/sheet5/4/output.txt:Zone.Identifier
--- a/sheet5/4/small_Doxyfile
+++ b/sheet5/4/small_Doxyfile
--- a/sheet5/4/small_Doxyfile:Zone.Identifier
+++ b/sheet5/4/small_Doxyfile:Zone.Identifier