scf_celebic/ex3/code/main.cpp

// Compile with "make" / "make run" or
// g++ *.cpp -O3 -ffast-math -lopenblas -llapacke -o main


#include "task_3.h"
#include "task_4+6.h"
#include "task_5.h"
#include "task_7.h"
#include "timing.h"
#include <cblas.h>               // cBLAS Library
#include <iomanip>
#include <iostream>
#include <lapacke.h>

void task_1() {
    printf("\n\n-------------- Task 1 --------------\n\n");
    printf("See comment in main.cpp");

    // -------------------------------------------------------------
    // STREAM version $Revision: 5.10 $
    // -------------------------------------------------------------
    // This system uses 8 bytes per array element.
    // -------------------------------------------------------------
    // Array size = 80000000 (elements), Offset = 0 (elements)
    // Memory per array = 610.4 MiB (= 0.6 GiB).
    // Total memory required = 1831.1 MiB (= 1.8 GiB).
    // Each kernel will be executed 20 times.
    //     The *best* time for each kernel (excluding the first iteration)
    //     will be used to compute the reported bandwidth.
    // -------------------------------------------------------------
    // Your clock granularity/precision appears to be 1 microseconds.
    // Each test below will take on the order of 116886 microseconds.
    //     (= 116886 clock ticks)
    // Increase the size of the arrays if this shows that
    // you are not getting at least 20 clock ticks per test.
    // -------------------------------------------------------------
    // WARNING -- The above is only a rough guideline.
    // For best results, please be sure you know the
    // precision of your system timer.
    // -------------------------------------------------------------
    // Function    Best Rate MB/s  Avg time     Min time     Max time
    // Copy:           29569.4     0.048585     0.043288     0.059164
    // Scale:          17644.0     0.082248     0.072546     0.102548
    // Add:            21030.1     0.100620     0.091298     0.124700
    // Triad:          21230.7     0.100758     0.090435     0.120631
    // -------------------------------------------------------------
    // Solution Validates: avg error less than 1.000000e-13 on all three arrays
    // -------------------------------------------------------------
    // ./flops.exe

    //     FLOPS C Program (Double Precision), V2.0 18 Dec 1992

    //     Module     Error        RunTime      MFLOPS
    //                             (usec)
    //         1      4.0146e-13      0.0024   5827.9076
    //         2     -1.4166e-13      0.0007  10037.8942
    //         3      4.7184e-14      0.0039   4371.9185
    //         4     -1.2557e-13      0.0034   4355.5711
    //         5     -1.3800e-13      0.0066   4415.6439
    //         6      3.2380e-13      0.0065   4441.6299
    //         7     -8.4583e-11      0.0053   2277.1707
    //         8      3.4867e-13      0.0069   4367.6094

    //     Iterations      =  512000000
    //     NullTime (usec) =     0.0000
    //     MFLOPS(1)       =  7050.6178
    //     MFLOPS(2)       =  3461.6233
    //     MFLOPS(3)       =  4175.0442
    //     MFLOPS(4)       =  4389.7311
}

void task_2() {
    printf("\n\n-------------- Task 2 --------------\n\n");
    printf("See comment in main.cpp");

    // Memory needed (double 64-bit, 8 bytes):
    // (A) (2N + 1) * 8 bytes
    // (B) (M*N + M + N) * 8 bytes
    // (C) (M*L + L*N + M*N) * 8 bytes
    // (D) (N + N + p) * 8 bytes

    // Floating point operations:
    // (A) 2N
    // (B) M * 2N
    // (C) M * 2L * N
    // (D) 2 * N * p (Horner Schema)

    // Read/Write operations:
    // (A) Read: 2N         Write: 1
    // (B) Read: M*2N       Write: M*N
    // (C) Read: M*2L*N     Write: M*L*N
    // (D) Read: 2*N*p      Write: N*P
}

void task_3() {
    printf("\n\n-------------- Task 3 --------------\n\n");
    printf("Functions implemented in task_3.cpp");
}

void task_4(bool cblas = false) {
    if (cblas == false) {printf("\n\n-------------- Task 4 --------------\n\n");}
    size_t M, N, L, p, NLOOPS;

    { //          Scalar product
    printf("----- Benchmark (A) -----\n");
    // Initialization
        N = 50'000'000;
        NLOOPS = 50;
        auto [x,y] = init_A(N);
    // Benchmark
        tic();
        benchmark_A(x, y, NLOOPS, cblas);
        double sec = toc() / NLOOPS;
    // Timings and Performance
        size_t memory = 2 * N;
        size_t flops  = 2 * N;
        print_performance(sec, memory, flops, sizeof(x[0]));
    printf("-------------------------\n");
    }

    { //      Matrix-Vector product
    printf("----- Benchmark (B) -----\n");
    // Initialization
        M = 8'000;
        N = 12'000;
        NLOOPS = 30;
        auto [A,x] = init_B(M,N);
    // Benchmark
        tic();
        benchmark_B(A, x, NLOOPS, cblas);
        double sec = toc() / NLOOPS;
    // Timings and Performance
        size_t memory = M*N + M + N;
        size_t flops  = 2 * M * N;
        print_performance(sec, memory, flops, sizeof(A[0]));
    printf("-------------------------\n");
    }

    { //     Matrix-Matrix product
    printf("----- Benchmark (C) -----\n");
    // Initialization
        M = 1'000;
        N = 2'000;
        L = 500;
        NLOOPS = 20;
        auto [A,B] = init_C(M,N,L);
    // Benchmark
        tic();
        benchmark_C(A, B, L, NLOOPS, cblas);
        double sec = toc() / NLOOPS;
    // Timings and Performance
        size_t memory = M*L + L*N + M*N;
        size_t flops  = M * 2*L * N;
        print_performance(sec, memory, flops, sizeof(A[0]));
    printf("-------------------------\n");
    }

    if (cblas == false)
    { //      Polynomial evaluation
    printf("----- Benchmark (D) -----\n");
    // Initialization
        N = 1'000'000;
        p = 200;
        NLOOPS = 20;
        auto [x,a] = init_D(N,p);
    // Benchmark
        tic();
        benchmark_D(x, a, NLOOPS);
        double sec = toc() / NLOOPS;
    // Timings  and Performance
        size_t memory = 2.0 * N;
        size_t flops  = 2.0 * N * p;
        print_performance(sec, memory, flops, sizeof(x[0]));
    printf("-------------------------\n");
    }
}

void task_5() {
    printf("\n\n-------------- Task 5 --------------\n\n");

    printf("----- Benchmark norm -----\n");
    // Initialization
        size_t N =50'000'000;
        size_t NLOOPS = 50;
        vector<double> x = init_norm(N);
    // Benchmark
        tic();
        benchmark_norm(x, NLOOPS);
        double sec = toc() / NLOOPS;
    // Timings and Performance
        size_t memory = N;
        size_t flops  = 2 * N;
        print_performance(sec, memory, flops, sizeof(x[0]));
    printf("-------------------------\n");
    printf("What do you observe? Why?\n");
    printf("-> Faster per loop than scalar product, only loads elements of 1 vector, instead of 2.");
}

void task_6() {
    printf("\n\n-------------- Task 6 --------------\n\n");
    printf("Benchmarks using cBLAS\n");
    task_4(true);
}

void task_7() {
    printf("\n\n-------------- Task 7 --------------\n\n");
    { // Check Ax=b
    size_t N=5, Nrhs=2;
    auto [A,b] = init_M(N,Nrhs);
    vector<double> A_og = A;

    printf("A =");
    print_matrix(A,N,N);
    printf("b =");
    print_matrix(b,N,Nrhs);

    int lda=N, ldb=Nrhs;
    vector<int> ipiv(N);
    LAPACKE_dgetrf(LAPACK_ROW_MAJOR, N, N, A.data(), lda, ipiv.data());
    LAPACKE_dgetrs(LAPACK_ROW_MAJOR, 'N', N, Nrhs, A.data(), lda, ipiv.data(), b.data(), ldb);

    printf("L + U =");
    print_matrix(A,N,N);
    printf("x =");
    print_matrix(b,N,Nrhs);

    int ldc=Nrhs;
    vector<double> C(N*Nrhs);
    cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, N, Nrhs, N, 1.0, A_og.data(), lda, b.data(), ldb, 0.0, C.data(), ldc);
    printf("Check solution:\nA * x = ");
    print_matrix(C,N,Nrhs);
    }
    // #################################
    // Benchmark

    cout << fixed << setprecision(4);
    size_t NLOOPS = 200;
    size_t Nrhs = 2000;
    cout << "Solution time per right hand side in milliseconds: sec*1000/Nrhs" << endl;
    cout << "N    = " << "     | 1      | 2      | 4      | 8      | 16     | 32 " << endl;
    cout <<       "------------|--------|--------|--------|--------|--------|-------" << endl;
    for (int k = 1; k < 10; ++k) {
        cout << "Nrhs = " << Nrhs*k;
        for (size_t N : {1, 2, 4, 8, 16, 32}) {
            tic();
            for (size_t i = 0; i < NLOOPS; ++i) {
                benchmark_lapacke(N, Nrhs*k);
            }
            double sec = toc()*1000 / (static_cast<double>(Nrhs)*k);
            cout << " | " << sec;
        }
        cout << endl;
    }
    printf("\nFor fixed n, the solution time per rhs stays roughly constant.");
}

int main() {
    task_1();
    task_2();
    task_3();
    task_4();
    task_5();
    task_6();
    task_7();
    printf("\n\n");

    return 0;
}