Exercises_MarkusSchmidt/sheet5/4/main.cpp

#include "mylib.h"
#include <cassert>
#include <chrono>           // timing
#include <cmath>            // sqrt()
#include <cstdlib>          // atoi()
#include <cstring>          // strncmp()
#include <ctime>
#include <iostream>
#include <sstream>
#include "benchmark.h"
#include "omp.h"
using namespace std;
using namespace std::chrono;  // timing

/**
 * Benchmark driver.
 *
 * Part 1 times benchmark_A, benchmark_B, benchmark_C and benchmark_D on fixed
 * problem sizes and prints the average time per call together with derived
 * throughput figures.
 * Part 2 compares the sequential (`*_old`) and the parallel variants of the
 * sum and inner product kernels for vector lengths n = 10^3 ... 10^8.
 *
 * NOTE: the values printed as "GFLOPS" are scaled by 2^30 (like the GiByte/s
 * values), not by 10^9. The scaling is kept so that numbers stay comparable
 * with earlier runs.
 *
 * @param argc unused
 * @param argv unused
 * @return 0
 */
int main(int argc, char **argv)
{
    (void)argc;   // the command line is not evaluated
    (void)argv;

    const unsigned int NA = 1400000;
    const unsigned int NLOOPSA = 2000;

    const unsigned int MC = 1000;
    const int NLOOPSC = 5;

    // Scaling factor 2^30 used for all throughput figures.
    // Dividing once by 2^30 is bit-identical to dividing three times by 1024.
    const double GIB = 1024.0 * 1024.0 * 1024.0;

    // Average time per repetition in seconds (microsecond resolution).
    // steady_clock is used because it is monotonic; system_clock may jump
    // when the wall clock is adjusted during a measurement.
    auto secPerLoop = [](steady_clock::time_point start,
                         steady_clock::time_point stop,
                         double nloops) -> double
    {
        const auto elapsed = duration_cast<microseconds>(stop - start);
        return static_cast<double>(elapsed.count()) / 1e6 / nloops;
    };

    // Common result output of the benchmarks B, C and D.
    auto report = [GIB](const char *label, double guard, double bytes,
                        double flops, double secs)
    {
        cout << "\n===== Benchmark " << label << " =====\n";
        cout << guard << endl;
        cout << "bytes: " << bytes << endl;
        cout << "Timing in sec. : " << secs << endl;
        cout << "GFLOPS         : " << flops / secs / GIB << endl;
        cout << "GiByte/s       : " << bytes / secs / GIB << endl;
    };

    // ---------- Benchmark A ----------
    {
        // y[i] = 1/x[i], so every product x[i]*y[i] is (up to rounding) 1
        vector<double> xA(NA), yA(NA);
        for (unsigned int i = 0; i < NA; ++i)
        {
            const double xi = (i % 219) + 1;
            xA[i] = xi;
            yA[i] = 1.0 / xi;
        }

        double sA = 0.0;
        double sumA = 0.0;      // consumes the result of every repetition; not printed
        const auto tA1 = steady_clock::now();
        for (unsigned int loop = 0; loop < NLOOPSA; ++loop)
        {
            sA = benchmark_A(xA, yA);
            sumA += sA;
        }
        const auto tA2 = steady_clock::now();

        const double tA = secPerLoop(tA1, tA2, NLOOPSA);   // seconds per call

        cout << "\n===== Benchmark A =====\n";
        cout << "<xA,yA> = "  << sA << endl;
        cout << "Timing in sec. : " << tA << endl;
        cout << "GFLOPS         : " << 2.0 * NA / tA / GIB << endl;
        cout << "GiByte/s       : "
             << 2.0 * NA * sizeof(xA[0]) / tA / GIB << endl;
    }

    // ---------- Benchmark B ----------
    {
        const unsigned int MB = 1700;
        const unsigned int NB = MB;
        const unsigned int NLOOPSB = 200; //50;

        // MB x NB matrix stored row-wise in a flat vector
        vector<double> AB(MB * NB);
        vector<double> xB(NB);

        for (unsigned int i = 0; i < MB; ++i)
            for (unsigned int j = 0; j < NB; ++j)
                AB[i * NB + j] = (i + j) % 219 + 1;

        // xB holds the reciprocals of matrix row 17
        for (unsigned int j = 0; j < NB; ++j)
            xB[j] = 1.0 / AB[17 * NB + j];

        vector<double> bB;
        double guardB = 0.0;
        const auto tB1 = steady_clock::now();
        for (unsigned int loop = 0; loop < NLOOPSB; ++loop)
        {
            bB = benchmark_B(AB, xB);
            guardB += bB[17];   // element matching the row used to build xB
        }
        const auto tB2 = steady_clock::now();

        const double tB = secPerLoop(tB1, tB2, NLOOPSB);
        const double flopsB = 2.0 * MB * NB;
        const double bytesB = (MB * NB + NB + MB) * sizeof(double);

        report("B", guardB, bytesB, flopsB, tB);
    }

    // ---------- Benchmark C ----------
    {
        const unsigned int LC = MC;
        const unsigned int NC = MC;

        // MC x LC and LC x NC matrices, both stored row-wise in flat vectors
        vector<double> AC(MC * LC), BC(LC * NC);

        for (unsigned int i = 0; i < MC; ++i)
            for (unsigned int j = 0; j < LC; ++j)
                AC[i * LC + j] = (i + j) % 219 + 1;

        for (unsigned int i = 0; i < LC; ++i)
            for (unsigned int j = 0; j < NC; ++j)
                BC[i * NC + j] = (i + j) % 219 + 1;

        vector<double> CC;
        double guardC = 0.0;
        const auto tC1 = steady_clock::now();
        for (int loop = 0; loop < NLOOPSC; ++loop)   // int index: NLOOPSC is signed
        {
            CC = benchmark_C(AC, BC, MC);
            guardC += CC[0];
        }
        const auto tC2 = steady_clock::now();

        const double tC = secPerLoop(tC1, tC2, NLOOPSC);
        const double flopsC = 2.0 * MC * LC * NC;
        const double bytesC = (MC * LC + LC * NC + MC * NC) * sizeof(double);

        report("C", guardC, bytesC, flopsC, tC);
    }

    // ---------- Benchmark D ----------
    {
        const unsigned int ND = 2000000;
        const unsigned int p  = 14;     // number of coefficients (presumably polynomial degree p-1 = 13 -- confirm in benchmark_D)
        const unsigned int NLOOPSD = 100;

        vector<double> coeff(p, 0.0);
        vector<double> xD(ND);

        for (unsigned int k = 0; k < p; ++k)
            coeff[k] = k % 219 + 1;

        for (unsigned int i = 0; i < ND; ++i)
            xD[i] = i % 219 + 1;

        vector<double> yD;
        double guardD = 0.0;
        const auto tD1 = steady_clock::now();
        for (unsigned int loop = 0; loop < NLOOPSD; ++loop)
        {
            yD = benchmark_D(coeff, xD);
            guardD += yD[0];
        }
        const auto tD2 = steady_clock::now();

        const double tD = secPerLoop(tD1, tD2, NLOOPSD);
        const double flopsD = 2.0 * ND * p;     // evaluated in double, no integer overflow possible
        const double bytesD = (p + 2 * ND) * sizeof(double);

        report("D", guardD, bytesD, flopsD, tD);
    }

    // ---------- Sequential vs. parallel: sum and inner product ----------
    const int NLOOPS = 20;

    // n is computed with exact integer arithmetic; converting the result of
    // pow(10.0, k) to an integer truncates and may yield 10^k - 1.
    // The largest pass (n = 10^8) allocates two vectors of 10^8 doubles.
    unsigned int n = 100;               // multiplied by 10 at the top of each pass
    for (int k = 3; k <= 8; ++k)
    {
        n *= 10;                        // n == 10^k

        vector<double> x(n), y(n);
        for (unsigned int i = 0; i < n; ++i)
        {
            const double xi = (i % 219) + 1;
            x[i] = xi;
            y[i] = 1.0 / xi;
        }

        // The guards consume every result and are printed below.
        double s1_guard = 0.0, s2_guard = 0.0;
        double ip1_guard = 0.0, ip2_guard = 0.0;

        // ---- SUM benchmark (sequential) ----
        double t0 = omp_get_wtime();
        for (int r = 0; r < NLOOPS; ++r)
            s1_guard += benchmark_A_sum_old(x);
        const double t_sum_seq = (omp_get_wtime() - t0) / NLOOPS;

        // ---- SUM benchmark (parallel) ----
        t0 = omp_get_wtime();
        for (int r = 0; r < NLOOPS; ++r)
            s2_guard += benchmark_A_sum(x);
        const double t_sum_omp = (omp_get_wtime() - t0) / NLOOPS;

        const double sum_speedup = t_sum_seq / t_sum_omp;

        // ---- INNER PRODUCT benchmark (sequential) ----
        t0 = omp_get_wtime();
        for (int r = 0; r < NLOOPS; ++r)
            ip1_guard += benchmark_A_old(x, y);
        const double t_inner_seq = (omp_get_wtime() - t0) / NLOOPS;

        // ---- INNER PRODUCT benchmark (parallel) ----
        t0 = omp_get_wtime();
        for (int r = 0; r < NLOOPS; ++r)
            ip2_guard += benchmark_A(x, y);
        const double t_inner_omp = (omp_get_wtime() - t0) / NLOOPS;

        const double inner_speedup = t_inner_seq / t_inner_omp;

        // ---- Print results ----
        std::cout << "k = " << k << " (n = 10^" << k << " = " << n << ")\n";
        std::cout << "SUM    seq: " << t_sum_seq   << " s,  omp: " << t_sum_omp
                  << " s,  speedup = " << sum_speedup   << '\n';
        std::cout << "INNER  seq: " << t_inner_seq << " s,  omp: " << t_inner_omp
                  << " s,  speedup = " << inner_speedup << '\n';
        std::cout << "guards: "
                  << s1_guard << ", " << s2_guard << ", "
                  << ip1_guard << ", " << ip2_guard << "\n\n";
    }

    return 0;
}